From 9634977303608833bb41437ee310bbdf37976637 Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Sat, 24 Jul 2021 20:25:18 +0800
Subject: [PATCH] [WIP] auto code-format bot (#849)

* Create code-format.yml

* trigger on push only

* Update code-format.yml

* apply code-format changes
---
 .clang-format                                 |  298 +-
 .github/workflows/code-format.yml             |   21 +
 benchmark/common/cmdline.hpp                  | 1412 ++++---
 examples/common/common.h                      |   10 +-
 examples/common/compiler_fp16.h               |   20 +-
 examples/common/msc_getopt.h                  |  735 ++--
 examples/common/stb_image.h                   | 3627 ++++++++---------
 examples/common/stb_image_write.h             |  787 ++--
 examples/common/tengine_operations.c          |  134 +-
 examples/common/test_nnie_all.hpp             |   48 +-
 examples/cpp_tm_classification.cpp            |   91 +-
 examples/cpp_tm_mobilenet_ssd.cpp             |   50 +-
 examples/tm_alphapose.cpp                     |  135 +-
 examples/tm_classification.c                  |   88 +-
 examples/tm_classification_acl.c              |   88 +-
 examples/tm_classification_cuda.cpp           |   92 +-
 examples/tm_classification_fp16.c             |   84 +-
 examples/tm_classification_int8.c             |   88 +-
 examples/tm_classification_timvx.c            |   96 +-
 examples/tm_classification_trt.cpp            |   91 +-
 examples/tm_classification_uint8.c            |   88 +-
 examples/tm_classification_vulkan.c           |   80 +-
 examples/tm_crnn.cpp                          |   48 +-
 examples/tm_efficientdet.c                    |  258 +-
 examples/tm_efficientdet_uint8.c              |  266 +-
 examples/tm_hrnet.cpp                         |  175 +-
 examples/tm_hrnet_timvx.cpp                   |  204 +-
 examples/tm_landmark.cpp                      |   46 +-
 examples/tm_landmark_timvx.cpp                |   46 +-
 examples/tm_landmark_uint8.cpp                |   46 +-
 examples/tm_mobilefacenet.cpp                 |   32 +-
 examples/tm_mobilefacenet_uint8.cpp           |   46 +-
 examples/tm_mobilenet_ssd.c                   |   56 +-
 examples/tm_mobilenet_ssd_acl.c               |   56 +-
 examples/tm_mobilenet_ssd_uint8.cpp           |   66 +-
 examples/tm_nanodet_m.cpp                     |  234 +-
 examples/tm_nanodet_m_timvx.cpp               |  259 +-
 examples/tm_openpose.cpp                      |   67 +-
 examples/tm_retinaface.cpp                    |   86 +-
 examples/tm_ultraface.cpp                     |  121 +-
 examples/tm_unet.cpp                          |  206 +-
 examples/tm_yolact.cpp                        |   73 +-
 examples/tm_yolact_uint8.cpp                  |  112 +-
 examples/tm_yolofastest.cpp                   |   53 +-
 examples/tm_yolov3.cpp                        |   74 +-
 examples/tm_yolov3_tiny.cpp                   |   70 +-
 examples/tm_yolov3_tiny_uint8.cpp             |   76 +-
 examples/tm_yolov3_uint8.cpp                  |   91 +-
 examples/tm_yolov4.cpp                        |   78 +-
 examples/tm_yolov4_tiny.cpp                   |   73 +-
 examples/tm_yolov4_tiny_timvx.cpp             |   80 +-
 examples/tm_yolov4_tiny_uint8.cpp             |   80 +-
 examples/tm_yolov4_uint8.cpp                  |   94 +-
 examples/tm_yolov5.cpp                        |   66 +-
 examples/tm_yolov5s.cpp                       |  120 +-
 examples/tm_yolov5s_timvx.cpp                 |  125 +-
 examples/tm_yolox.cpp                         |  114 +-
 examples/tm_yolox_timvx.cpp                   | 1153 +++---
 source/api/c_api.c                            |  167 +-
 source/api/c_api.h                            |  101 +-
 source/api/plugin.c                           |    9 +-
 source/device/acl/acl_define.h                |    3 +-
 source/device/acl/acl_device.hpp              |    3 +-
 source/device/acl/acl_executor.hpp            |   15 +-
 source/device/acl/acl_graph.hpp               |    4 +-
 source/device/acl/acl_limit.hpp               |   48 +-
 source/device/cpu/cpu_define.h                |   35 +-
 source/device/cpu/cpu_device.c                |   74 +-
 source/device/cpu/cpu_device.h                |    3 -
 source/device/cpu/cpu_dump.c                  |  614 ++-
 source/device/cpu/cpu_dump.h                  |    1 -
 source/device/cpu/cpu_graph.c                 |   14 +-
 source/device/cpu/cpu_graph.h                 |   26 +-
 source/device/cpu/cpu_module.c                |   17 +-
 source/device/cpu/cpu_module.h                |    2 -
 source/device/cpu/cpu_node.c                  |    4 +-
 source/device/cpu/cpu_node.h                  |   17 +-
 source/device/cpu/cpu_pool.c                  |   37 +-
 source/device/cpu/cpu_pool.h                  |   10 +-
 source/device/cpu/op/absval/absval_ref.c      |   11 +-
 .../cpu/op/absval/cortex-a/absval_hcl_arm.c   |   12 +-
 source/device/cpu/op/add_n/add_n_ref.c        |   15 +-
 source/device/cpu/op/argmax/argmax_ref.c      |   15 +-
 source/device/cpu/op/argmin/argmin_ref.c      |   15 +-
 .../cpu/op/batchnorm/batchnorm_kernel_ref.h   |    1 -
 .../op/batchnorm/batchnorm_kernel_ref_fp32.c  |    1 -
 .../op/batchnorm/batchnorm_kernel_ref_uint8.c |    9 +-
 .../device/cpu/op/batchnorm/batchnorm_ref.c   |   30 +-
 .../op/batchnorm/cortex-a/batchnorm_hcl_arm.c |   28 +-
 .../batchnorm/cortex-a/batchnorm_kernel_arm.c |   11 +-
 .../batchnorm/cortex-a/batchnorm_kernel_arm.h |    1 -
 .../op/batchtospacend/batchtospacend_ref.c    |   15 +-
 source/device/cpu/op/bias/bias_ref.c          |    1 -
 source/device/cpu/op/broadmul/broadmul_ref.c  |    2 -
 source/device/cpu/op/cast/cast_ref.c          |    2 -
 source/device/cpu/op/ceil/ceil_ref.c          |   11 +-
 source/device/cpu/op/clip/clip_kernel_ref.h   |    2 -
 .../device/cpu/op/clip/clip_kernel_ref_fp32.c |    1 -
 .../cpu/op/clip/clip_kernel_ref_uint8.c       |   13 +-
 source/device/cpu/op/clip/clip_ref.c          |    3 +-
 .../comparison/comparison_kernel_ref_fp32.c   |   61 +-
 .../device/cpu/op/comparison/comparison_ref.c |    3 +-
 .../device/cpu/op/concat/concat_kernel_ref.h  |    2 -
 .../cpu/op/concat/concat_kernel_ref_fp32.c    |   71 +-
 .../cpu/op/concat/concat_kernel_ref_int8.c    |  105 +-
 .../cpu/op/concat/concat_kernel_ref_uint8.c   |  111 +-
 source/device/cpu/op/concat/concat_ref.c      |   19 +-
 source/device/cpu/op/conv/conv_kernel_ref.h   |    9 +-
 .../device/cpu/op/conv/conv_kernel_ref_fp16.c |   29 +-
 .../device/cpu/op/conv/conv_kernel_ref_fp32.c |   16 +-
 .../device/cpu/op/conv/conv_kernel_ref_int8.c |   30 +-
 .../cpu/op/conv/conv_kernel_ref_uint8.c       |   38 +-
 source/device/cpu/op/conv/conv_ref.c          |   29 +-
 .../armv8.2/conv_dw_kernel_fp16_arm82.c       |   16 +-
 .../armv8.2/conv_dw_kernel_fp16_arm82.h       |    3 +-
 .../cortex-a/armv8.2/conv_kernel_fp16_arm82.c |  224 +-
 .../cortex-a/conv_dw_dilation_kernel_arm.h    |   12 +-
 .../cpu/op/conv/cortex-a/conv_dw_hcl_arm.c    |   45 +-
 .../conv/cortex-a/conv_dw_k5_k7_kernel_arm.h  |  167 +-
 .../cpu/op/conv/cortex-a/conv_dw_kernel_arm.c |   17 +-
 .../cpu/op/conv/cortex-a/conv_dw_kernel_arm.h |   13 +-
 .../conv/cortex-a/conv_dw_kernel_int8_arm.c   |   69 +-
 .../conv/cortex-a/conv_dw_kernel_int8_arm.h   |    7 +-
 .../cpu/op/conv/cortex-a/conv_hcl_arm.c       |   61 +-
 .../cpu/op/conv/cortex-a/conv_kernel_arm.c    |   88 +-
 .../cpu/op/conv/cortex-a/conv_kernel_arm.h    |   23 +-
 .../op/conv/cortex-a/conv_kernel_int8_arm.c   |  790 ++--
 .../op/conv/cortex-a/conv_kernel_int8_arm.h   |    7 +-
 .../op/conv/cortex-a/wino_conv_kernel_1_arm.c |  424 +-
 .../op/conv/cortex-a/wino_conv_kernel_1_arm.h |    8 +-
 .../op/conv/cortex-a/wino_conv_kernel_arm.c   |  251 +-
 .../op/conv/cortex-a/wino_conv_kernel_arm.h   |    4 +-
 .../device/cpu/op/conv/cortex-m/conv_cmsis.c  |   32 +-
 .../cpu/op/conv/mips/conv_dw_hcl_mips.c       |   10 +-
 .../cpu/op/conv/mips/conv_dw_kernel_mips.c    |   53 +-
 .../cpu/op/conv/mips/conv_dw_kernel_mips.h    |    4 +-
 .../device/cpu/op/conv/mips/conv_hcl_mips.c   |   28 +-
 .../cpu/op/conv/mips/conv_kernel_mips.c       |   83 +-
 .../cpu/op/conv/mips/wino_conv_kernel_mips.c  |   79 +-
 .../cpu/op/conv/mips/wino_conv_kernel_mips.h  |    4 +-
 .../op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c  |   19 +-
 .../conv/risc-v/lp64dv/conv_dw_kernel_rv64.c  |   27 +-
 .../conv/risc-v/lp64dv/conv_dw_kernel_rv64.h  |    1 -
 .../cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c |   42 +-
 .../op/conv/risc-v/lp64dv/conv_kernel_rv64.c  |   97 +-
 .../op/conv/risc-v/lp64dv/conv_kernel_rv64.h  |    2 +-
 .../op/conv/x86/conv_direct_hcl_int8_x86.c    |   96 +-
 .../device/cpu/op/conv/x86/conv_dw_hcl_x86.c  |  105 +-
 .../cpu/op/conv/x86/conv_dw_kernel_x86.c      |   49 +-
 .../cpu/op/conv/x86/conv_dw_kernel_x86.h      |    1 -
 source/device/cpu/op/conv/x86/conv_hcl_x86.c  |   46 +-
 .../device/cpu/op/conv/x86/conv_kernel_x86.c  |  243 +-
 .../device/cpu/op/conv/x86/conv_kernel_x86.h  |    1 -
 .../cpu/op/conv/x86/wino_conv_kernel_x86.c    |   80 +-
 .../cpu/op/conv/x86/wino_conv_kernel_x86.h    |    1 -
 source/device/cpu/op/crop/crop_ref.c          |   31 +-
 .../op/deconv/cortex_a/deconv_dw_hcl_arm.c    |   11 +-
 .../op/deconv/cortex_a/deconv_dw_kernel_arm.c |   49 +-
 .../op/deconv/cortex_a/deconv_dw_kernel_arm.h |   15 +-
 .../cpu/op/deconv/cortex_a/deconv_hcl_arm.c   |   23 +-
 .../op/deconv/cortex_a/deconv_kernel_arm.c    |  273 +-
 .../op/deconv/cortex_a/deconv_kernel_arm.h    |   32 +-
 source/device/cpu/op/deconv/deconv_ref.c      |   72 +-
 .../cpu/op/depthtospace/depthtospace_ref.c    |    1 -
 .../detection_output/detection_output_ref.c   |   73 +-
 .../detection_postprocess_ref.c               |  160 +-
 source/device/cpu/op/dropout/dropout_ref.c    |    1 -
 .../cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c |    5 +-
 .../cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h |    1 -
 .../op/eltwise/cortex-a/eltwise_kernel_arm.c  |   83 +-
 .../op/eltwise/cortex-a/eltwise_kernel_arm.h  |    1 -
 source/device/cpu/op/eltwise/eltwise_ref.c    | 1032 ++---
 .../device/cpu/op/elu/cortex-a/elu_hcl_arm.c  |    3 +-
 .../cpu/op/elu/cortex-a/elu_kernel_arm.c      |    7 +-
 .../cpu/op/elu/cortex-a/elu_kernel_arm.h      |    1 -
 source/device/cpu/op/elu/elu_ref.c            |   14 +-
 .../device/cpu/op/embedding/embedding_ref.c   |   11 +-
 source/device/cpu/op/expand/expand_ref.c      |   61 +-
 .../device/cpu/op/expanddims/expanddims_ref.c |    1 -
 .../cortex-a/armv8.2/fc_kernel_fp16_arm82.c   |   95 +-
 .../cortex-a/armv8.2/fc_kernel_fp16_arm82.h   |   28 +-
 source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c |   40 +-
 .../device/cpu/op/fc/cortex-a/fc_kernel_arm.c |   21 +-
 .../device/cpu/op/fc/cortex-a/fc_kernel_arm.h |    1 -
 .../cpu/op/fc/cortex-a/fc_kernel_int8_arm.c   |  179 +-
 .../cpu/op/fc/cortex-a/fc_kernel_int8_arm.h   |   26 +-
 source/device/cpu/op/fc/cortex-m/fc_cmsis.c   |   12 +-
 source/device/cpu/op/fc/fc_ref.c              |   63 +-
 source/device/cpu/op/fc/x86/fc_hcl_x86.c      |   30 +-
 source/device/cpu/op/flatten/flatten_ref.c    |    1 -
 source/device/cpu/op/gather/gather_ref.c      |   38 +-
 source/device/cpu/op/gru/gru_ref.c            |   82 +-
 .../cpu/op/hardsigmoid/hardsigmoid_ref.c      |    5 +-
 .../cpu/op/hardswish/hardswish_kernel_ref.h   |    2 -
 .../op/hardswish/hardswish_kernel_ref_fp32.c  |    3 +-
 .../op/hardswish/hardswish_kernel_ref_uint8.c |    7 +-
 .../device/cpu/op/hardswish/hardswish_ref.c   |    4 +-
 source/device/cpu/op/input/input_ref.c        |    1 -
 .../cpu/op/instancenorm/instancenorm_ref.c    |   19 +-
 .../cpu/op/interp/cortex-a/interp_hcl_arm.c   |    3 +-
 .../op/interp/cortex-a/interp_kernel_arm.c    |   93 +-
 .../op/interp/cortex-a/interp_kernel_arm.h    |    1 -
 source/device/cpu/op/interp/interp_ref.c      |   68 +-
 .../op/l2normalization/l2normalization_ref.c  |    8 +-
 source/device/cpu/op/l2pool/l2pool_ref.c      |   46 +-
 source/device/cpu/op/logical/logical_ref.c    |   49 +-
 source/device/cpu/op/logistic/logistic_ref.c  |    7 +-
 .../device/cpu/op/logsoftmax/logsoftmax_ref.c |   48 +-
 .../device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c  |    3 +-
 .../cpu/op/lrn/cortex-a/lrn_kernel_arm.c      |   17 +-
 .../cpu/op/lrn/cortex-a/lrn_kernel_arm.h      |    1 -
 source/device/cpu/op/lrn/lrn_ref.c            |    7 +-
 source/device/cpu/op/lstm/lstm_ref.c          |  146 +-
 source/device/cpu/op/matmul/matmul_ref.c      |    3 +-
 source/device/cpu/op/maximum/maximum_ref.c    |   15 +-
 source/device/cpu/op/mean/mean_ref.c          |   17 +-
 source/device/cpu/op/minimum/minimum_ref.c    |   15 +-
 .../cpu/op/mish/cortex-a/mish_hcl_arm.c       |    5 +-
 .../cpu/op/mish/cortex-a/mish_kernel_arm.c    |    7 +-
 .../cpu/op/mish/cortex-a/mish_kernel_arm.h    |    1 -
 .../cpu/op/mish/cortex-a/mish_math_func.h     |    1 -
 source/device/cpu/op/mish/mish_kernel_ref.h   |    6 +-
 .../device/cpu/op/mish/mish_kernel_ref_fp32.c |    1 -
 .../cpu/op/mish/mish_kernel_ref_uint8.c       |   10 +-
 source/device/cpu/op/mish/mish_ref.c          |    7 +-
 source/device/cpu/op/mvn/mvn_ref.c            |    7 +-
 source/device/cpu/op/noop/noop_ref.c          |   38 +-
 .../device/cpu/op/normalize/normalize_ref.c   |   17 +-
 source/device/cpu/op/pad/pad_ref.c            |   26 +-
 source/device/cpu/op/permute/permute_ref.c    |   15 +-
 .../cpu/op/pooling/cortex-a/pooling_hcl_arm.c |   17 +-
 .../cpu/op/pooling/cortex-a/pooling_hcl_arm.h |   45 +-
 .../pooling/cortex-a/pooling_hcl_arm_int8.h   |  251 +-
 .../cpu/op/pooling/cortex-m/pooling_cmsis.c   |    2 +-
 .../cpu/op/pooling/pooling_kernel_ref.h       |    9 +-
 .../cpu/op/pooling/pooling_kernel_ref_fp16.c  |   44 +-
 .../cpu/op/pooling/pooling_kernel_ref_fp32.c  |   10 +-
 .../cpu/op/pooling/pooling_kernel_ref_int8.c  |   31 +-
 .../cpu/op/pooling/pooling_kernel_ref_uint8.c |   17 +-
 source/device/cpu/op/pooling/pooling_ref.c    |   28 +-
 .../cpu/op/prelu/cortex_a/prelu_hcl_arm.c     |    4 +-
 .../cpu/op/prelu/cortex_a/prelu_kernel_arm.c  |    1 -
 source/device/cpu/op/prelu/prelu_ref.c        |   28 +-
 source/device/cpu/op/priorbox/priorbox_ref.c  |   23 +-
 .../cpu/op/psroipooling/psroipooling_ref.c    |   17 +-
 .../device/cpu/op/reciprocal/reciprocal_ref.c |   18 +-
 source/device/cpu/op/reducel2/reducel2_ref.c  |    7 +-
 .../cpu/op/reduction/reduction_kernel_ref.h   |  910 ++---
 .../device/cpu/op/reduction/reduction_ref.c   |   16 +-
 source/device/cpu/op/region/region_ref.c      |    3 +-
 .../cpu/op/relu/cortex-a/relu_hcl_arm.c       |    3 +-
 .../cpu/op/relu/cortex-a/relu_hcl_arm.h       |    5 +-
 .../cpu/op/relu/cortex-a/relu_kernel_arm.c    |    7 +-
 .../cpu/op/relu/cortex-a/relu_kernel_arm.h    |    1 -
 .../device/cpu/op/relu/cortex-m/relu_cmsis.c  |    1 -
 source/device/cpu/op/relu/relu_kernel_ref.h   |    2 -
 .../device/cpu/op/relu/relu_kernel_ref_fp16.c |    5 +-
 .../device/cpu/op/relu/relu_kernel_ref_fp32.c |    1 -
 .../device/cpu/op/relu/relu_kernel_ref_int8.c |    7 +-
 .../cpu/op/relu/relu_kernel_ref_uint8.c       |    7 +-
 source/device/cpu/op/relu/relu_ref.c          |    9 +-
 source/device/cpu/op/relu1/relu1_ref.c        |    1 -
 source/device/cpu/op/relu6/relu6_ref.c        |   18 +-
 source/device/cpu/op/reorg/reorg_ref.c        |    3 +-
 source/device/cpu/op/reshape/reshape_ref.c    |   52 +-
 source/device/cpu/op/resize/resize_ref.c      |   14 +-
 source/device/cpu/op/reverse/reverse_ref.c    |   31 +-
 source/device/cpu/op/rnn/rnn_ref.c            |   19 +-
 source/device/cpu/op/roialign/roialign_ref.c  |   21 +-
 .../device/cpu/op/roipooling/roipooling_ref.c |   17 +-
 source/device/cpu/op/round/round_ref.c        |    1 -
 source/device/cpu/op/rpn/rpn_ref.c            |   37 +-
 source/device/cpu/op/scale/scale_ref.c        |    3 +-
 source/device/cpu/op/scatter/scatter_ref.c    |  276 +-
 .../cpu/op/selu/cortex-a/selu_hcl_arm.c       |    3 +-
 .../cpu/op/selu/cortex-a/selu_kernel_arm.c    |    7 +-
 .../cpu/op/selu/cortex-a/selu_kernel_arm.h    |    1 -
 source/device/cpu/op/selu/selu_ref.c          |   27 +-
 source/device/cpu/op/shape/shape_ref.c        |    4 +-
 .../op/shuffle_channel/shuffle_channel_ref.c  |   11 +-
 .../cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c |    1 -
 .../op/sigmoid/cortex-a/sigmoid_kernel_arm.c  |   23 +-
 .../op/sigmoid/cortex-a/sigmoid_kernel_arm.h  |    1 -
 source/device/cpu/op/sigmoid/sigmoid_ref.c    |   35 +-
 source/device/cpu/op/slice/slice_ref.c        |   67 +-
 .../cpu/op/softmax/cortex-a/softmax_hcl_arm.c |   34 +-
 .../cpu/op/softmax/cortex-m/softmax_cmsis.c   |    6 +-
 .../cpu/op/softmax/softmax_kernel_ref.h       |   14 +-
 .../cpu/op/softmax/softmax_kernel_ref_fp32.c  |    7 +-
 .../cpu/op/softmax/softmax_kernel_ref_int8.c  |   11 +-
 .../cpu/op/softmax/softmax_kernel_ref_uint8.c |   11 +-
 source/device/cpu/op/softmax/softmax_ref.c    |   21 +-
 source/device/cpu/op/softplus/softplus_ref.c  |   19 +-
 .../op/spacetobatchnd/spacetobatchnd_ref.c    |   39 +-
 .../cpu/op/spacetodepth/spacetodepth_ref.c    |    4 +-
 .../cpu/op/sparsetodense/sparsetodense_ref.c  |    3 +-
 .../spatialtransformer_ref.c                  |  135 +-
 source/device/cpu/op/split/split_ref.c        |   19 +-
 .../squareddifference/squareddifference_ref.c |   15 +-
 source/device/cpu/op/squeeze/squeeze_ref.c    |    5 +-
 .../cpu/op/strided_slice/strided_slice_ref.c  |   17 +-
 .../device/cpu/op/swap_axis/swap_axis_ref.c   |   27 +-
 .../cpu/op/tanh/cortex-a/tanh_hcl_arm.c       |    5 +-
 .../cpu/op/tanh/cortex-a/tanh_kernel_arm.c    |    9 +-
 source/device/cpu/op/tanh/tanh_ref.c          |   11 +-
 .../device/cpu/op/threshold/threshold_ref.c   |   15 +-
 source/device/cpu/op/tile/tile_ref.c          |   68 +-
 source/device/cpu/op/topkv2/topkv2_ref.c      |   27 +-
 .../device/cpu/op/transpose/transpose_ref.c   |  148 +-
 source/device/cpu/op/unary/unary_kernel_ref.h |    1 -
 .../cpu/op/unary/unary_kernel_ref_fp32.c      |  209 +-
 .../cpu/op/unary/unary_kernel_ref_uint8.c     |  215 +-
 source/device/cpu/op/unary/unary_ref.c        |    9 +-
 .../device/cpu/op/unsqueeze/unsqueeze_ref.c   |    5 +-
 source/device/cpu/op/upsample/upsample_ref.c  |    9 +-
 source/device/cpu/op/where/where_ref.c        |   15 +-
 .../device/cpu/op/zeroslike/zeroslike_ref.c   |    3 +-
 source/device/cuda/cuda_device.hpp            |    3 +-
 source/device/cuda/cuda_executor.hpp          |    7 +-
 source/device/cuda/cuda_graph.hpp             |    4 +-
 source/device/cuda/cuda_limit.hpp             |   35 +-
 source/device/device.c                        |    4 +-
 source/device/device.h                        |   20 +-
 source/device/opencl/ocl_define.h             |    3 +-
 source/device/opencl/ocl_device.hpp           |    3 +-
 source/device/opencl/ocl_executor.hpp         |   28 +-
 source/device/opencl/ocl_graph.hpp            |    4 +-
 source/device/opencl/ocl_helper.hpp           |   10 +-
 source/device/opencl/ocl_limit.hpp            |  245 +-
 source/device/tensorrt/trt_define.h           |   13 +-
 source/device/tensorrt/trt_device.hpp         |    2 -
 source/device/tensorrt/trt_executor.hpp       |    5 +-
 source/device/tensorrt/trt_graph.hpp          |    1 -
 source/device/tensorrt/trt_helper.hpp         |   97 +-
 source/device/tensorrt/trt_limit.hpp          |  159 +-
 source/device/tim-vx/timvx_device.hpp         |    3 +-
 source/device/tim-vx/timvx_dump.c             | 1119 +++--
 source/device/tim-vx/timvx_dump.h             |   68 +-
 source/device/tim-vx/timvx_executor.hpp       |   34 +-
 source/device/tim-vx/timvx_graph.hpp          |    4 +-
 source/device/tim-vx/timvx_limit.hpp          |  138 +-
 source/device/vulkan/layer/concat_vulkan.cpp  |   63 +-
 source/device/vulkan/layer/concat_vulkan.hpp  |    6 +-
 .../vulkan/layer/convolution_vulkan.cpp       |  114 +-
 .../vulkan/layer/convolution_vulkan.hpp       |   10 +-
 .../layer/convolutiondepthwise_vulkan.cpp     |  157 +-
 .../layer/convolutiondepthwise_vulkan.hpp     |    9 +-
 source/device/vulkan/layer/crop_vulkan.cpp    |   92 +-
 source/device/vulkan/layer/crop_vulkan.hpp    |    8 +-
 source/device/vulkan/layer/dropout_vulkan.cpp |   29 +-
 source/device/vulkan/layer/dropout_vulkan.hpp |    7 +-
 source/device/vulkan/layer/eltwise_vulkan.cpp |   49 +-
 source/device/vulkan/layer/eltwise_vulkan.hpp |    8 +-
 source/device/vulkan/layer/flatten_vulkan.cpp |   50 +-
 source/device/vulkan/layer/flatten_vulkan.hpp |    7 +-
 .../vulkan/layer/innerproduct_vulkan.cpp      |   59 +-
 .../vulkan/layer/innerproduct_vulkan.hpp      |    6 +-
 source/device/vulkan/layer/interp_vulkan.cpp  |   86 +-
 source/device/vulkan/layer/interp_vulkan.hpp  |   10 +-
 source/device/vulkan/layer/packing_vulkan.cpp |    9 +-
 source/device/vulkan/layer/packing_vulkan.hpp |    3 +-
 source/device/vulkan/layer/padding_vulkan.cpp |   44 +-
 source/device/vulkan/layer/padding_vulkan.hpp |    5 +-
 source/device/vulkan/layer/permute_vulkan.cpp |   61 +-
 source/device/vulkan/layer/permute_vulkan.hpp |    6 +-
 source/device/vulkan/layer/pooling_vulkan.cpp |   67 +-
 source/device/vulkan/layer/pooling_vulkan.hpp |   23 +-
 .../device/vulkan/layer/priorbox_vulkan.cpp   |   41 +-
 .../device/vulkan/layer/priorbox_vulkan.hpp   |    6 +-
 source/device/vulkan/layer/relu_vulkan.cpp    |   38 +-
 source/device/vulkan/layer/relu_vulkan.hpp    |    6 +-
 source/device/vulkan/layer/reshape_vulkan.cpp |   79 +-
 source/device/vulkan/layer/reshape_vulkan.hpp |    7 +-
 source/device/vulkan/layer/softmax_vulkan.cpp |   63 +-
 source/device/vulkan/layer/softmax_vulkan.hpp |    7 +-
 source/device/vulkan/layer_shader_type.h      |    2 +-
 source/device/vulkan/vulkan_allocator.cpp     |  200 +-
 source/device/vulkan/vulkan_allocator.hpp     |   53 +-
 source/device/vulkan/vulkan_command.cpp       |  376 +-
 source/device/vulkan/vulkan_command.hpp       |  113 +-
 source/device/vulkan/vulkan_define.h          |    3 +-
 source/device/vulkan/vulkan_device.hpp        |    3 +-
 source/device/vulkan/vulkan_executor.hpp      |   19 +-
 source/device/vulkan/vulkan_gpu.cpp           |  340 +-
 source/device/vulkan/vulkan_gpu.hpp           |   28 +-
 source/device/vulkan/vulkan_graph.hpp         |   32 +-
 source/device/vulkan/vulkan_helper.hpp        |    6 +-
 source/device/vulkan/vulkan_layer.cpp         |    2 +-
 source/device/vulkan/vulkan_layer.hpp         |    5 +-
 source/device/vulkan/vulkan_limit.hpp         |  245 +-
 source/device/vulkan/vulkan_option.cpp        |    2 +-
 source/device/vulkan/vulkan_pipeline.cpp      |   53 +-
 source/device/vulkan/vulkan_pipeline.hpp      |    2 +-
 source/device/vulkan/vulkan_platform.hpp      |   74 +-
 source/device/vulkan/vulkan_tensor.cpp        |   16 +-
 source/device/vulkan/vulkan_tensor.hpp        |   50 +-
 source/executer/executer.c                    |   28 +-
 source/executer/executer.h                    |   33 +-
 source/graph/graph.c                          |   46 +-
 source/graph/graph.h                          |   47 +-
 source/graph/node.c                           |   44 +-
 source/graph/node.h                           |   31 +-
 source/graph/subgraph.c                       |   28 +-
 source/graph/subgraph.h                       |   29 +-
 source/graph/tensor.c                         |   54 +-
 source/graph/tensor.h                         |   67 +-
 source/module/module.c                        |   45 +-
 source/module/module.h                        |   20 -
 source/operator/op.c                          |    2 -
 source/operator/op.h                          |   27 +-
 source/operator/op_name.h                     |  204 +-
 source/operator/prototype/absval.c            |    6 -
 source/operator/prototype/add_n.c             |    4 -
 source/operator/prototype/argmax.c            |   11 +-
 source/operator/prototype/argmin.c            |   13 +-
 source/operator/prototype/batchnorm.c         |    7 +-
 source/operator/prototype/batchtospacend.c    |   17 +-
 source/operator/prototype/bias.c              |    9 +-
 source/operator/prototype/broadmul.c          |    9 +-
 source/operator/prototype/cast.c              |    4 -
 source/operator/prototype/ceil.c              |    8 +-
 source/operator/prototype/clip.c              |   12 +-
 source/operator/prototype/comparison.c        |    9 +-
 source/operator/prototype/concat.c            |    9 +-
 source/operator/prototype/const.c             |   10 +-
 source/operator/prototype/convolution.c       |   17 +-
 source/operator/prototype/convolution_param.h |   22 +-
 source/operator/prototype/crop.c              |   10 +-
 source/operator/prototype/deconvolution.c     |    6 +-
 source/operator/prototype/depthtospace.c      |   19 +-
 source/operator/prototype/detection_output.c  |   11 +-
 .../prototype/detection_postprocess.c         |   17 +-
 .../prototype/detection_postprocess_param.h   |    2 +-
 source/operator/prototype/dropout.c           |    8 +-
 source/operator/prototype/eltwise.c           |   10 +-
 source/operator/prototype/elu.c               |    7 +-
 source/operator/prototype/embedding.c         |   10 +-
 source/operator/prototype/embedding_param.h   |    2 +-
 source/operator/prototype/expand.c            |   76 +-
 source/operator/prototype/expanddims.c        |    9 +-
 source/operator/prototype/fc.c                |    3 +-
 source/operator/prototype/flatten.c           |   10 +-
 source/operator/prototype/gather.c            |   50 +-
 source/operator/prototype/gemm.c              |   11 +-
 source/operator/prototype/generic.c           |    7 +-
 source/operator/prototype/generic_param.h     |    2 +-
 source/operator/prototype/gru.c               |   10 +-
 source/operator/prototype/gru_param.h         |    2 +-
 source/operator/prototype/hardsigmoid.c       |    7 +-
 source/operator/prototype/hardswish.c         |    6 +-
 source/operator/prototype/input.c             |   10 +-
 source/operator/prototype/instancenorm.c      |    7 +-
 source/operator/prototype/interp.c            |   13 +-
 source/operator/prototype/l2normalization.c   |    9 +-
 source/operator/prototype/l2pool.c            |   29 +-
 source/operator/prototype/l2pool_param.h      |    3 +-
 source/operator/prototype/logical.c           |    7 +-
 source/operator/prototype/logsoftmax.c        |    9 +-
 source/operator/prototype/lrn.c               |    8 +-
 source/operator/prototype/lstm.c              |    9 +-
 source/operator/prototype/lstm_param.h        |    2 +-
 source/operator/prototype/matmul.c            |    5 -
 source/operator/prototype/maximum.c           |    3 -
 source/operator/prototype/mean.c              |    3 -
 source/operator/prototype/minimum.c           |    3 -
 source/operator/prototype/mish.c              |    9 +-
 source/operator/prototype/mvn.c               |    6 +-
 source/operator/prototype/noop.c              |    4 -
 source/operator/prototype/normalize.c         |    6 +-
 source/operator/prototype/pad.c               |   20 +-
 source/operator/prototype/pad_param.h         |   10 +-
 source/operator/prototype/permute.c           |    9 +-
 source/operator/prototype/pooling.c           |   13 +-
 source/operator/prototype/pooling_param.h     |    4 +-
 source/operator/prototype/prelu.c             |   10 +-
 source/operator/prototype/priorbox.c          |   12 +-
 source/operator/prototype/psroipooling.c      |    7 +-
 source/operator/prototype/reciprocal.c        |    4 +-
 source/operator/prototype/reducel2.c          |   13 +-
 source/operator/prototype/reduction.c         |   18 +-
 source/operator/prototype/region.c            |    6 +-
 source/operator/prototype/relu.c              |    7 +-
 source/operator/prototype/relu1.c             |   10 +-
 source/operator/prototype/relu6.c             |    9 +-
 source/operator/prototype/reorg.c             |    9 +-
 source/operator/prototype/reshape.c           |   42 +-
 source/operator/prototype/resize.c            |   17 +-
 source/operator/prototype/resize_param.h      |    2 +-
 source/operator/prototype/reverse.c           |    5 -
 source/operator/prototype/rnn.c               |    9 +-
 source/operator/prototype/roialign.c          |   10 +-
 source/operator/prototype/roipooling.c        |    8 +-
 source/operator/prototype/round.c             |    5 -
 source/operator/prototype/rpn.c               |   32 +-
 source/operator/prototype/scale.c             |    7 +-
 source/operator/prototype/scatter.c           |    8 +-
 source/operator/prototype/selu.c              |    7 +-
 source/operator/prototype/shape.c             |    9 +-
 source/operator/prototype/shuffle_channel.c   |    8 +-
 source/operator/prototype/sigmoid.c           |    5 -
 source/operator/prototype/slice.c             |   26 +-
 source/operator/prototype/slice_param.h       |    1 -
 source/operator/prototype/softmax.c           |    7 +-
 source/operator/prototype/softplus.c          |    5 +-
 source/operator/prototype/spacetobatchnd.c    |   14 +-
 source/operator/prototype/spacetodepth.c      |   19 +-
 source/operator/prototype/sparsetodense.c     |    7 +-
 .../operator/prototype/spatialtransformer.c   |   27 +-
 source/operator/prototype/split.c             |   15 +-
 source/operator/prototype/squareddifference.c |    9 +-
 source/operator/prototype/squeeze.c           |   13 +-
 source/operator/prototype/strided_slice.c     |   32 +-
 source/operator/prototype/swap_axis.c         |   11 +-
 source/operator/prototype/tanh.c              |    7 +-
 source/operator/prototype/threshold.c         |    7 +-
 source/operator/prototype/tile.c              |   89 +-
 source/operator/prototype/topkv2.c            |   10 +-
 source/operator/prototype/transpose.c         |   14 +-
 source/operator/prototype/unary.c             |    8 +-
 source/operator/prototype/unsqueeze.c         |   12 +-
 source/operator/prototype/upsample.c          |    9 +-
 source/operator/prototype/where.c             |    9 +-
 source/operator/prototype/zeroslike.c         |    9 +-
 source/optimizer/estimation.c                 |   11 +-
 source/optimizer/estimation.h                 |   15 +-
 source/optimizer/helper.c                     |    4 -
 source/optimizer/helper.h                     |    1 -
 source/optimizer/split.c                      |   13 +-
 source/optimizer/split.h                      |    1 -
 source/scheduler/scheduler.c                  |   22 +-
 source/scheduler/scheduler.h                  |   10 +-
 source/serializer/serializer.c                |    1 -
 source/serializer/serializer.h                |    2 -
 source/serializer/tmfile/op/tm2_add_n.c       |    3 -
 source/serializer/tmfile/op/tm2_argmax.c      |    6 +-
 source/serializer/tmfile/op/tm2_argmin.c      |    8 +-
 source/serializer/tmfile/op/tm2_batchnorm.c   |    8 +-
 .../serializer/tmfile/op/tm2_batchtospacend.c |    8 +-
 source/serializer/tmfile/op/tm2_bias.c        |    4 -
 source/serializer/tmfile/op/tm2_broadmul.c    |    4 -
 source/serializer/tmfile/op/tm2_cast.c        |    8 +-
 source/serializer/tmfile/op/tm2_ceil.c        |    4 -
 source/serializer/tmfile/op/tm2_clip.c        |    8 +-
 source/serializer/tmfile/op/tm2_comparison.c  |    8 +-
 source/serializer/tmfile/op/tm2_concat.c      |    8 +-
 source/serializer/tmfile/op/tm2_conv.c        |    8 +-
 source/serializer/tmfile/op/tm2_crop.c        |    8 +-
 source/serializer/tmfile/op/tm2_deconv.c      |   16 +-
 .../serializer/tmfile/op/tm2_depthtospace.c   |    8 +-
 .../tmfile/op/tm2_detection_output.c          |    8 +-
 .../tmfile/op/tm2_detection_postprocess.c     |   12 +-
 source/serializer/tmfile/op/tm2_dropout.c     |    4 -
 source/serializer/tmfile/op/tm2_eltwise.c     |    8 +-
 source/serializer/tmfile/op/tm2_elu.c         |    8 +-
 source/serializer/tmfile/op/tm2_embedding.c   |    8 +-
 source/serializer/tmfile/op/tm2_expand.c      |   15 +-
 source/serializer/tmfile/op/tm2_expanddims.c  |    8 +-
 source/serializer/tmfile/op/tm2_fc.c          |    8 +-
 source/serializer/tmfile/op/tm2_flatten.c     |    8 +-
 source/serializer/tmfile/op/tm2_gather.c      |   14 +-
 source/serializer/tmfile/op/tm2_gemm.c        |    8 +-
 source/serializer/tmfile/op/tm2_generic.c     |   10 +-
 source/serializer/tmfile/op/tm2_gru.c         |    8 +-
 source/serializer/tmfile/op/tm2_hardsigmoid.c |    8 +-
 source/serializer/tmfile/op/tm2_hardswish.c   |    8 +-
 .../serializer/tmfile/op/tm2_instancenorm.c   |    8 +-
 source/serializer/tmfile/op/tm2_interp.c      |    8 +-
 .../tmfile/op/tm2_l2normalization.c           |    4 -
 source/serializer/tmfile/op/tm2_l2pool.c      |    6 +-
 source/serializer/tmfile/op/tm2_logical.c     |    8 +-
 source/serializer/tmfile/op/tm2_logistic.c    |    4 -
 source/serializer/tmfile/op/tm2_logsoftmax.c  |    4 -
 source/serializer/tmfile/op/tm2_lrn.c         |    8 +-
 source/serializer/tmfile/op/tm2_lstm.c        |    8 +-
 source/serializer/tmfile/op/tm2_matmul.c      |    4 -
 source/serializer/tmfile/op/tm2_maximum.c     |    6 +-
 source/serializer/tmfile/op/tm2_mean.c        |    4 -
 source/serializer/tmfile/op/tm2_mish.c        |    4 -
 source/serializer/tmfile/op/tm2_mvn.c         |    8 +-
 source/serializer/tmfile/op/tm2_noop.c        |    4 -
 source/serializer/tmfile/op/tm2_normalize.c   |    8 +-
 source/serializer/tmfile/op/tm2_pad.c         |    8 +-
 source/serializer/tmfile/op/tm2_permute.c     |    8 +-
 source/serializer/tmfile/op/tm2_pool.c        |    7 +-
 source/serializer/tmfile/op/tm2_prelu.c       |    4 -
 source/serializer/tmfile/op/tm2_priorbox.c    |    5 -
 .../serializer/tmfile/op/tm2_psroipooling.c   |    8 +-
 source/serializer/tmfile/op/tm2_reciprocal.c  |    2 +-
 source/serializer/tmfile/op/tm2_reducel2.c    |    8 +-
 source/serializer/tmfile/op/tm2_reduction.c   |    8 +-
 source/serializer/tmfile/op/tm2_region.c      |    8 +-
 source/serializer/tmfile/op/tm2_relu.c        |    8 +-
 source/serializer/tmfile/op/tm2_relu1.c       |    4 -
 source/serializer/tmfile/op/tm2_relu6.c       |    4 -
 source/serializer/tmfile/op/tm2_reorg.c       |    8 +-
 source/serializer/tmfile/op/tm2_reshape.c     |   12 +-
 source/serializer/tmfile/op/tm2_resize.c      |    8 +-
 source/serializer/tmfile/op/tm2_reverse.c     |    4 -
 source/serializer/tmfile/op/tm2_rnn.c         |   16 +-
 source/serializer/tmfile/op/tm2_roialign.c    |    8 +-
 source/serializer/tmfile/op/tm2_roipooling.c  |    8 +-
 source/serializer/tmfile/op/tm2_round.c       |    6 +-
 source/serializer/tmfile/op/tm2_rpn.c         |   12 +-
 source/serializer/tmfile/op/tm2_scale.c       |    8 +-
 source/serializer/tmfile/op/tm2_scatter.c     |    6 +-
 source/serializer/tmfile/op/tm2_selu.c        |    8 +-
 source/serializer/tmfile/op/tm2_shape.c       |    4 -
 .../tmfile/op/tm2_shuffle_channel.c           |    8 +-
 source/serializer/tmfile/op/tm2_sigmoid.c     |    4 -
 source/serializer/tmfile/op/tm2_slice.c       |   18 +-
 source/serializer/tmfile/op/tm2_softmax.c     |    8 +-
 source/serializer/tmfile/op/tm2_softplus.c    |    2 +-
 .../serializer/tmfile/op/tm2_spacetobatchnd.c |    8 +-
 .../serializer/tmfile/op/tm2_spacetodepth.c   |    4 -
 .../serializer/tmfile/op/tm2_sparsetodense.c  |    8 +-
 .../tmfile/op/tm2_spatialtransformer.c        |   14 +-
 source/serializer/tmfile/op/tm2_split.c       |   12 +-
 .../tmfile/op/tm2_squareddifference.c         |    4 -
 source/serializer/tmfile/op/tm2_squeeze.c     |    8 +-
 .../serializer/tmfile/op/tm2_strided_slice.c  |    8 +-
 source/serializer/tmfile/op/tm2_swap_axis.c   |    8 +-
 source/serializer/tmfile/op/tm2_tanh.c        |    4 -
 source/serializer/tmfile/op/tm2_threshold.c   |    8 +-
 source/serializer/tmfile/op/tm2_tile.c        |   10 +-
 source/serializer/tmfile/op/tm2_topkv2.c      |    8 +-
 source/serializer/tmfile/op/tm2_transpose.c   |    6 +-
 source/serializer/tmfile/op/tm2_unary.c       |    8 +-
 source/serializer/tmfile/op/tm2_unsqueeze.c   |   12 +-
 source/serializer/tmfile/op/tm2_upsample.c    |    8 +-
 source/serializer/tmfile/op/tm2_where.c       |    4 -
 source/serializer/tmfile/op/tm2_zeroslike.c   |    4 -
 source/serializer/tmfile/tm2_format.h         |  521 ++-
 source/serializer/tmfile/tm2_serializer.c     |  173 +-
 source/serializer/tmfile/tm2_serializer.h     |    9 +-
 source/system/cpu.c                           |   40 +-
 source/utility/float.c                        |   60 +-
 source/utility/float.h                        |   56 +-
 source/utility/lock.c                         |   30 +-
 source/utility/lock.h                         |   15 +-
 source/utility/log.c                          |   78 +-
 source/utility/log.h                          |  124 +-
 source/utility/math.c                         |    9 -
 source/utility/math.h                         |    8 -
 source/utility/mem_stat.c                     |    6 +-
 source/utility/sys_port.c                     |    2 +-
 source/utility/sys_port.h                     |    4 +-
 source/utility/utils.c                        |  126 +-
 source/utility/utils.h                        |   11 -
 source/utility/vector.c                       |   25 +-
 source/utility/vector.h                       |   27 +-
 tests/common/common.h                         |   10 +-
 tests/common/compiler_fp16.h                  |   20 +-
 tests/common/stb_image.h                      | 3627 ++++++++---------
 tests/common/stb_image_write.h                |  787 ++--
 tests/common/tengine_operations.c             |  136 +-
 tests/common/util/mathp.c                     |    9 -
 tests/common/util/mathp.h                     |    8 -
 tests/common/util/vector.c                    |   25 +-
 tests/common/util/vector.h                    |   28 +-
 tests/models/test_model_alphapose.cpp         |   57 +-
 tests/models/test_model_classification.cpp    |  100 +-
 tests/models/test_model_common.cpp            |   48 +-
 tests/models/test_model_crnn.cpp              |   44 +-
 tests/models/test_model_efficientdet.c        |   99 +-
 tests/models/test_model_hrnet.cpp             |   63 +-
 tests/models/test_model_landmark.cpp          |   48 +-
 tests/models/test_model_mobilefacenet.cpp     |   38 +-
 tests/models/test_model_mobilenet_ssd.c       |   54 +-
 tests/models/test_model_nanodet_m.cpp         |  113 +-
 tests/models/test_model_openpose.cpp          |   45 +-
 tests/models/test_model_retinaface.cpp        |   77 +-
 tests/models/test_model_ultraface.cpp         |   61 +-
 tests/models/test_model_unet.cpp              |  105 +-
 tests/models/test_model_yolact.cpp            |   57 +-
 tests/models/test_model_yolofastest.cpp       |   41 +-
 tests/models/test_model_yolov3.cpp            |   28 +-
 tests/models/test_model_yolov3_tiny.cpp       |   62 +-
 tests/models/test_model_yolov4.cpp            |   74 +-
 tests/models/test_model_yolov4_tiny.cpp       |   61 +-
 tests/models/test_model_yolov5s.cpp           |   53 +-
 tests/models/test_timvx_model_yolov5s.cpp     |  125 +-
 tests/op/test_onnx_op.h                       |    7 +-
 tests/op/test_onnx_op_abs.cpp                 |   11 +-
 tests/op/test_onnx_op_acos.cpp                |   11 +-
 tests/op/test_onnx_op_add.cpp                 |   13 +-
 tests/op/test_onnx_op_asin.cpp                |   11 +-
 tests/op/test_onnx_op_atan.cpp                |   11 +-
 .../test_onnx_op_averagepool_2d_default.cpp   |   11 +-
 tests/op/test_onnx_op_averagepool_2d_pads.cpp |   11 +-
 .../test_onnx_op_basic_conv_with_padding.cpp  |   13 +-
 ...est_onnx_op_basic_conv_without_padding.cpp |   13 +-
 tests/op/test_onnx_op_ceil.cpp                |   11 +-
 tests/op/test_onnx_op_clip_example.cpp        |   17 +-
 tests/op/test_onnx_op_concat_1d_axis_0.cpp    |   15 +-
 tests/op/test_onnx_op_concat_2d_axis_0.cpp    |   15 +-
 tests/op/test_onnx_op_concat_2d_axis_1.cpp    |   15 +-
 tests/op/test_onnx_op_concat_3d_axis_0.cpp    |   13 +-
 tests/op/test_onnx_op_concat_3d_axis_1.cpp    |   13 +-
 tests/op/test_onnx_op_concat_3d_axis_2.cpp    |   13 +-
 ...t_onnx_op_conv_with_strides_no_padding.cpp |   13 +-
 ...test_onnx_op_conv_with_strides_padding.cpp |   13 +-
 tests/op/test_onnx_op_convtranspose.cpp       |   13 +-
 .../test_onnx_op_convtranspose_dilations.cpp  |   13 +-
 tests/op/test_onnx_op_convtranspose_pad.cpp   |   13 +-
 tests/op/test_onnx_op_convtranspose_pads.cpp  |   13 +-
 tests/op/test_onnx_op_cos.cpp                 |   11 +-
 .../op/test_onnx_op_depthtospace_dcr_mode.cpp |   11 +-
 tests/op/test_onnx_op_div.cpp                 |   13 +-
 tests/op/test_onnx_op_dropout_default.cpp     |   11 +-
 tests/op/test_onnx_op_elu.cpp                 |   11 +-
 tests/op/test_onnx_op_equal.cpp               |   13 +-
 tests/op/test_onnx_op_exp.cpp                 |   11 +-
 .../op/test_onnx_op_expand_dim_unchanged.cpp  |   14 +-
 tests/op/test_onnx_op_floor.cpp               |   11 +-
 tests/op/test_onnx_op_globalaveragepool.cpp   |   11 +-
 tests/op/test_onnx_op_greater.cpp             |   11 +-
 tests/op/test_onnx_op_gru_defaults.cpp        |   17 +-
 tests/op/test_onnx_op_gru_seq_length.cpp      |   23 +-
 .../op/test_onnx_op_gru_with_initial_bias.cpp |   23 +-
 tests/op/test_onnx_op_hardsigmoid.cpp         |   11 +-
 .../op/test_onnx_op_instancenorm_epsilon.cpp  |   27 +-
 .../op/test_onnx_op_instancenorm_example.cpp  |   27 +-
 tests/op/test_onnx_op_leakyrelu.cpp           |   11 +-
 tests/op/test_onnx_op_less.cpp                |   13 +-
 tests/op/test_onnx_op_log.cpp                 |   11 +-
 .../test_onnx_op_logsoftmax_default_axis.cpp  |   11 +-
 tests/op/test_onnx_op_lstm_defaults.cpp       |   17 +-
 .../test_onnx_op_lstm_with_initial_bias.cpp   |   23 +-
 tests/op/test_onnx_op_matmul_2d.cpp           |   13 +-
 tests/op/test_onnx_op_matmul_3d.cpp           |   13 +-
 tests/op/test_onnx_op_matmul_4d.cpp           |   13 +-
 tests/op/test_onnx_op_maxpool_2d_default.cpp  |   11 +-
 .../op/test_onnx_op_maxpool_2d_dilations.cpp  |   11 +-
 tests/op/test_onnx_op_maxpool_2d_pads.cpp     |   11 +-
 tests/op/test_onnx_op_neg.cpp                 |   11 +-
 tests/op/test_onnx_op_pow.cpp                 |   13 +-
 tests/op/test_onnx_op_reciprocal.cpp          |   11 +-
 .../test_onnx_op_reduce_log_sum_default.cpp   |   11 +-
 ...educe_max_default_axes_keepdim_example.cpp |   11 +-
 ...uce_mean_default_axes_keepdims_example.cpp |   11 +-
 ...duce_min_default_axes_keepdims_example.cpp |   11 +-
 ...m_square_default_axes_keepdims_example.cpp |   11 +-
 tests/op/test_onnx_op_relu.cpp                |   11 +-
 tests/op/test_onnx_op_selu.cpp                |   11 +-
 tests/op/test_onnx_op_selu_default.cpp        |   11 +-
 .../op/test_onnx_op_softmax_default_axis.cpp  |   11 +-
 tests/op/test_onnx_op_softplus.cpp            |   11 +-
 tests/op/test_onnx_op_squeeze.cpp             |   11 +-
 tests/op/test_onnx_op_sub.cpp                 |   13 +-
 tests/op/test_onnx_op_tanh.cpp                |   11 +-
 tests/op/test_onnx_op_unsqueeze_axis_1.cpp    |   11 +-
 tests/op/test_op.h                            |  499 ++-
 tests/op/test_op_conv.c                       |   70 +-
 tests/op/test_op_prelu.c                      |   17 +-
 tests/op/test_op_relu.c                       |   14 +-
 tests/op/test_op_relu6.c                      |   14 +-
 tests/op/test_tensorrt_op_clip.cpp            |  239 +-
 tests/op/test_tensorrt_op_concat.cpp          |  338 +-
 tests/op/test_tensorrt_op_deconv.cpp          |  469 ++-
 tests/op/test_tensorrt_op_dropout.cpp         |  277 +-
 tests/op/test_tensorrt_op_eltwise.cpp         |  343 +-
 tests/op/test_tensorrt_op_fc.cpp              |  315 +-
 tests/op/test_timvx_op_clip.cpp               |   22 +-
 tests/op/test_timvx_op_concat.cpp             |  412 +-
 tests/op/test_timvx_op_convolution.cpp        |   41 +-
 tests/op/test_timvx_op_deconv.cpp             |  537 +--
 tests/op/test_timvx_op_dropout.cpp            |   22 +-
 tests/op/test_timvx_op_eltwise_mul.cpp        |  409 +-
 tests/op/test_timvx_op_eltwise_sum.cpp        |  409 +-
 tests/op/test_timvx_op_elu.cpp                |   22 +-
 tests/op/test_timvx_op_fc.cpp                 |  401 +-
 tests/op/test_timvx_op_flatten.cpp            |  346 +-
 tests/op/test_timvx_op_gather.cpp             |  356 +-
 tests/op/test_timvx_op_hardswish.cpp          |   22 +-
 tests/op/test_timvx_op_interp.cpp             |  364 +-
 tests/op/test_timvx_op_leakyrelu.cpp          |   24 +-
 tests/op/test_timvx_op_mish.cpp               |   21 +-
 tests/op/test_timvx_op_permute.cpp            |  361 +-
 tests/op/test_timvx_op_pooling.cpp            |   26 +-
 tests/op/test_timvx_op_prelu.cpp              |   24 +-
 tests/op/test_timvx_op_relu.cpp               |   22 +-
 tests/op/test_timvx_op_relu1.cpp              |   22 +-
 tests/op/test_timvx_op_reshape.cpp            |  372 +-
 tests/op/test_timvx_op_resize.cpp             |  360 +-
 tests/op/test_timvx_op_sigmoid.cpp            |   22 +-
 tests/op/test_timvx_op_slice.cpp              |  354 +-
 tests/op/test_timvx_op_softmax.cpp            |  344 +-
 tests/op/test_timvx_op_split.cpp              |  424 +-
 tests/op/test_timvx_op_tanh.cpp               |   21 +-
 tests/op/test_timvx_op_transpose.cpp          |  406 +-
 tests/op/test_timvx_op_upsampling.cpp         |  356 +-
 tools/convert_tool/caffe/caffe2tengine.cpp    |  177 +-
 tools/convert_tool/caffe/caffe2tengine.hpp    |   33 +-
 tools/convert_tool/convert_tool.cpp           |   57 +-
 tools/convert_tool/ncnn/ncnn2tengine.cpp      |  311 +-
 tools/convert_tool/ncnn/ncnn2tengine.hpp      |   39 +-
 tools/convert_tool/onnx/onnx2tengine.cpp      |  564 ++-
 tools/convert_tool/onnx/onnx2tengine.hpp      |   31 +-
 .../utils/graph_optimizer/graph_opt.cpp       |  112 +-
 .../utils/graph_optimizer/graph_opt.hpp       |   29 +-
 .../utils/save_graph/save_graph.cpp           |   96 +-
 .../utils/save_graph/save_graph.hpp           |    5 +-
 .../utils/save_graph/tm2_generate.c           |    2 +-
 .../utils/save_graph/tm2_op_save.cpp          |  483 ++-
 .../utils/save_graph/tm2_op_save.hpp          |   14 +-
 tools/quantize/compiler_fp16.h                |   20 +-
 tools/quantize/quant_save_graph.cpp           | 2044 +++++-----
 tools/quantize/quant_save_graph.hpp           |  106 +-
 tools/quantize/quant_tool.hpp                 |   32 +-
 tools/quantize/quant_tool_int8.cpp            |  144 +-
 tools/quantize/quant_tool_uint8.cpp           |  166 +-
 .../quantize/quant_tool_uint8_perchannel.cpp  | 1107 +++--
 tools/quantize/quant_utils.cpp                | 1093 +++--
 tools/quantize/quant_utils.hpp                |   97 +-
 tools/quantize/savegraph/save_graph.cpp       |   95 +-
 tools/quantize/savegraph/save_graph.hpp       |    5 +-
 tools/quantize/savegraph/tm2_format.h         |  594 +--
 tools/quantize/savegraph/tm2_generate.c       |    2 +-
 tools/quantize/savegraph/tm2_op_save.cpp      |  456 +--
 tools/quantize/savegraph/tm2_op_save.hpp      |   13 +-
 820 files changed, 28221 insertions(+), 29759 deletions(-)
 create mode 100644 .github/workflows/code-format.yml

diff --git a/.clang-format b/.clang-format
index a969255c2..3519ad950 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,170 +1,132 @@
-Language:               Cpp
-
-AccessModifierOffset:   -4
-
-AlignAfterOpenBracket:          Align
-AlignConsecutiveAssignments:    AcrossEmptyLines
-AlignConsecutiveBitFields:      AcrossEmptyLinesAndComments
-AlignConsecutiveDeclarations:   Consecutive
-AlignConsecutiveMacros:         AcrossEmptyLines
-AlignEscapedNewlines:           Right
-AlignOperands:                  true
-AlignTrailingComments:          true
-
-AllowAllArgumentsOnNextLine:                true
-AllowAllConstructorInitializersOnNextLine:  true
-AllowAllParametersOfDeclarationOnNextLine:  true
-
-AllowShortBlocksOnASingleLine:              Empty
-AllowShortCaseLabelsOnASingleLine:          false
-AllowShortEnumsOnASingleLine:               false
-AllowShortFunctionsOnASingleLine:           Empty
-AllowShortIfStatementsOnASingleLine:        Never
-AllowShortLambdasOnASingleLine:             Inline
-AllowShortLoopsOnASingleLine:               false
-
-# AlwaysBreakAfterDefinitionReturnType is deprecated
-AlwaysBreakAfterReturnType:                 None
-AlwaysBreakBeforeMultilineStrings:          false
-AlwaysBreakTemplateDeclarations:            Yes
-
-BinPackArguments:       true
-BinPackParameters:      true
-
-BitFieldColonSpacing:   Both
-
+# find source/ tools/ tests/ examples/ benchmark/ -type f \( -name '*.c' -o -name '*.cpp' -o -name '*.h' \) | xargs -I {} clang-format -i {}
+
+# need clang-format >= 10.0
+
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+# AlignConsecutiveBitFields: true
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: true
+AlignEscapedNewlines: Left
+# AlignOperands: AlignAfterOperator
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Always
+AllowShortCaseLabelsOnASingleLine: true
+# AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: WithoutElse
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
 BraceWrapping:
-    AfterCaseLabel:         true
-    AfterClass:             false
-    AfterControlStatement:  Always
-    AfterEnum:              true
-    AfterFunction:          true
-    AfterNamespace:         true
-    AfterObjCDeclaration:   true
-    AfterStruct:            true
-    AfterUnion:             true
-    AfterExternBlock:       true
-    BeforeCatch:            true
-    BeforeElse:             true
-    BeforeLambdaBody:       false
-    BeforeWhile:            true
-    IndentBraces:           false
-    SplitEmptyFunction:     false
-    SplitEmptyRecord:       false
-    SplitEmptyNamespace:    false
-BreakBeforeBinaryOperators:                 NonAssignment
-BreakBeforeBraces:                          Custom
-BreakBeforeConceptDeclarations:             true
-BreakBeforeTernaryOperators:                false
-BreakConstructorInitializers:               BeforeColon
-BreakConstructorInitializersBeforeComma:    false
-BreakInheritanceList:                       BeforeColon
-BreakStringLiterals:                        false
-
-ColumnLimit:        120
-
-CommentPragmas:     '^ AYU pragma:'
-
-CompactNamespaces:  false
-
-ConstructorInitializerAllOnOneLineOrOnePerLine:     false
-ConstructorInitializerIndentWidth:                  4
-
-ContinuationIndentWidth:        4
-
-Cpp11BracedListStyle:       false
-
-DeriveLineEnding:           true
-DerivePointerAlignment:     false
-
-DisableFormat:              false
-
-ExperimentalAutoDetectBinPacking: false
-
-FixNamespaceComments:       true
-
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
-
-IncludeCategories: 
-  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
-    Priority:        2
-  - Regex:           '^(<|"(gtest|isl|json)/)'
-    Priority:        3
-  - Regex:           '.*'
-    Priority:        1
-
-IndentCaseBlocks:           true
-IndentCaseLabels:           false
-IndentExternBlock:          AfterExternBlock
-IndentGotoLabels:           false
-IndentPPDirectives:         BeforeHash
-IndentRequires:             true
-IndentWidth:                4
-IndentWrappedFunctionNames: false
-
-InsertTrailingCommas:       Wrapped
-
-KeepEmptyLinesAtTheStartOfBlocks:   false
-
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-
-MaxEmptyLinesToKeep: 3
-
+  AfterCaseLabel: true
+  AfterClass: true
+  AfterControlStatement: Always
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: true
+  AfterUnion: true
+  AfterExternBlock: false
+  BeforeCatch: true
+  BeforeElse: true
+#  BeforeLambdaBody: false
+#  BeforeWhile: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: false
+BreakAfterJavaFieldAnnotations: true
+BreakBeforeBinaryOperators: All
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: false
+ColumnLimit: 0
+# CommentPragmas:
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: false
+DerivePointerAlignment: false
+# DisableFormat:
+# ExperimentalAutoDetectBinPacking:
+FixNamespaceComments: true
+# ForEachMacros:
+IncludeBlocks: Regroup
+# IncludeCategories:
+# IncludeIsMainRegex:
+# IncludeIsMainSourceRegex:
+# IndentCaseBlocks: false
+IndentCaseLabels: false
+# IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: None
+IndentWidth: 4
+# IndentWrappedFunctionNames: 4
+# InsertTrailingCommas: None
+# JavaImportGroups:
+# JavaScriptQuotes
+# JavaScriptWrapImports:
+KeepEmptyLinesAtTheStartOfBlocks: false
+Language: Cpp
+# MacroBlockBegin:
+# MacroBlockEnd:
+MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
-
-ObjCBinPackProtocolList:            Auto
-ObjCBlockIndentWidth:               4
-ObjCBreakBeforeNestedBlockParam:    false
-ObjCSpaceAfterProperty:             false
-ObjCSpaceBeforeProtocolList:        true
-
-PenaltyBreakBeforeFirstCallParameter:   19
-PenaltyBreakComment:                    300
-PenaltyBreakFirstLessLess:              120
-PenaltyBreakString:                     1000
-PenaltyExcessCharacter:                 1000000
-PenaltyReturnTypeOnItsOwnLine:          60
-
-PointerAlignment:   Left
-
-#RawStringFormats:
-
-ReflowComments:     false
-
-SortIncludes:           Never
-SortUsingDeclarations:  false
-
-SpaceAfterCStyleCast:               false
-SpaceAfterLogicalNot:               false
-SpaceAfterTemplateKeyword:          true
-SpaceAroundPointerQualifiers:       Before
-SpaceBeforeAssignmentOperators:     true
-SpaceBeforeCaseColon:               false
-SpaceBeforeCpp11BracedList:         true
-SpaceBeforeCtorInitializerColon:    true
-SpaceBeforeInheritanceColon:        true
-SpaceBeforeParens:                  ControlStatements
-SpaceBeforeRangeBasedForLoopColon:  true
-SpaceBeforeSquareBrackets:          false
-SpaceInEmptyBlock:                  false
-SpaceInEmptyParentheses:            false
-SpacesBeforeTrailingComments:       4
-SpacesInAngles:                     false
-SpacesInCStyleCastParentheses:      false
-SpacesInConditionalStatement:       false
-SpacesInContainerLiterals:          true
-SpacesInParentheses:                false
-SpacesInSquareBrackets:             false
-
-Standard:       c++11
-
-TabWidth:       4
-
-UseCRLF:        false
-
-UseTab:         Never
-
-# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
-# https://www.cnblogs.com/PaulpauL/p/5929753.html
-# https://my.oschina.net/u/4393102/blog/3349736
+# NamespaceMacros:
+# ObjCBinPackProtocolList:
+# ObjCBlockIndentWidth:
+# ObjCBreakBeforeNestedBlockParam:
+# ObjCSpaceAfterProperty:
+# ObjCSpaceBeforeProtocolList:
+# PenaltyBreakAssignment:
+# PenaltyBreakBeforeFirstCallParameter:
+# PenaltyBreakComment:
+# PenaltyBreakFirstLessLess:
+# PenaltyBreakString:
+# PenaltyBreakTemplateDeclaration:
+# PenaltyExcessCharacter:
+# PenaltyReturnTypeOnItsOwnLine:
+PointerAlignment: Left
+# RawStringFormats:
+ReflowComments: false
+SortIncludes: false
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: c++03
+#StatementMacros:
+TabWidth: 4
+# TypenameMacros:
+UseCRLF: false
+UseTab: Never
diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml
new file mode 100644
index 000000000..60441b8da
--- /dev/null
+++ b/.github/workflows/code-format.yml
@@ -0,0 +1,21 @@
+name: code-format
+
+on: [push, pull_request, pull_request_target]
+
+jobs:
+  code-format:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: apt
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y dos2unix clang-format-12
+        sudo update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-12 120
+    - name: code-format
+      run: |
+        python scripts/clang-format-all.py
+    - uses: stefanzweifel/git-auto-commit-action@v4
+      with:
+        commit_message: apply code-format changes
diff --git a/benchmark/common/cmdline.hpp b/benchmark/common/cmdline.hpp
index 5b88c778a..b26c944c3 100644
--- a/benchmark/common/cmdline.hpp
+++ b/benchmark/common/cmdline.hpp
@@ -43,778 +43,914 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace cmdline {
 
-    namespace detail {
-
-        template <typename Target, typename Source, bool Same>
-        class lexical_cast_t {
-        public:
-            static Target cast(const Source &arg) {
-                Target ret;
-                std::stringstream ss;
-                if (!(ss << arg && ss >> ret && ss.eof()))
-                    throw std::bad_cast();
-
-                return ret;
-            }
-        };
+namespace detail {
 
-        template <typename Target, typename Source>
-        class lexical_cast_t<Target, Source, true> {
-        public:
-            static Target cast(const Source &arg) {
-                return arg;
-            }
-        };
-
-        template <typename Source>
-        class lexical_cast_t<std::string, Source, false> {
-        public:
-            static std::string cast(const Source &arg) {
-                std::ostringstream ss;
-                ss << arg;
-                return ss.str();
-            }
-        };
-
-        template <typename Target>
-        class lexical_cast_t<Target, std::string, false> {
-        public:
-            static Target cast(const std::string &arg) {
-                Target ret;
-                std::istringstream ss(arg);
-                if (!(ss >> ret && ss.eof()))
-                    throw std::bad_cast();
-                return ret;
-            }
-        };
+template<typename Target, typename Source, bool Same>
+class lexical_cast_t
+{
+public:
+    static Target cast(const Source& arg)
+    {
+        Target ret;
+        std::stringstream ss;
+        if (!(ss << arg && ss >> ret && ss.eof()))
+            throw std::bad_cast();
 
-        template <typename T1, typename T2>
-        struct is_same {
-            static const bool value = false;
-        };
+        return ret;
+    }
+};
 
-        template <typename T>
-        struct is_same<T, T> {
-            static const bool value = true;
-        };
+template<typename Target, typename Source>
+class lexical_cast_t<Target, Source, true>
+{
+public:
+    static Target cast(const Source& arg)
+    {
+        return arg;
+    }
+};
 
-        template<typename Target, typename Source>
-        Target lexical_cast(const Source &arg)
-        {
-            return lexical_cast_t<Target, Source, detail::is_same<Target, Source>::value>::cast(arg);
-        }
+template<typename Source>
+class lexical_cast_t<std::string, Source, false>
+{
+public:
+    static std::string cast(const Source& arg)
+    {
+        std::ostringstream ss;
+        ss << arg;
+        return ss.str();
+    }
+};
 
-        static inline std::string demangle(const std::string &name)
-        {
+template<typename Target>
+class lexical_cast_t<Target, std::string, false>
+{
+public:
+    static Target cast(const std::string& arg)
+    {
+        Target ret;
+        std::istringstream ss(arg);
+        if (!(ss >> ret && ss.eof()))
+            throw std::bad_cast();
+        return ret;
+    }
+};
+
+template<typename T1, typename T2>
+struct is_same
+{
+    static const bool value = false;
+};
+
+template<typename T>
+struct is_same<T, T>
+{
+    static const bool value = true;
+};
+
+template<typename Target, typename Source>
+Target lexical_cast(const Source& arg)
+{
+    return lexical_cast_t<Target, Source, detail::is_same<Target, Source>::value>::cast(arg);
+}
+
+static inline std::string demangle(const std::string& name)
+{
 #ifdef _MSC_VER
-            return name; // MSVC return name
-#elif defined(__GNUC__) 
-            // call the original methods when compiler is GCC
-            int status = 0;
-            char *p = abi::__cxa_demangle(name.c_str(), 0, 0, &status);
-            std::string ret(p);
-            free(p);
-            return ret;
+    return name; // MSVC return name
+#elif defined(__GNUC__)
+    // call the original methods when compiler is GCC
+    int status = 0;
+    char* p = abi::__cxa_demangle(name.c_str(), 0, 0, &status);
+    std::string ret(p);
+    free(p);
+    return ret;
 #else
-            // other compiler need more work
+    // other compiler need more work
 #error unexpected c complier (msc/gcc), Need to implement this method for demangle
 #endif
-        }
-
-        template <class T>
-        std::string readable_typename()
-        {
-            return demangle(typeid(T).name());
-        }
-
-        template <class T>
-        std::string default_value(T def)
-        {
-            return detail::lexical_cast<std::string>(def);
-        }
-
-        template <>
-        inline std::string readable_typename<std::string>()
-        {
-            return "string";
-        }
-
-    } // detail
-
-    //-----
-
-    class cmdline_error : public std::exception {
-    public:
-        cmdline_error(const std::string &msg) : msg(msg) {}
-        ~cmdline_error() throw() {}
-        const char *what() const throw() { return msg.c_str(); }
-    private:
-        std::string msg;
-    };
-
-    template <class T>
-    struct default_reader {
-        T operator()(const std::string &str) {
-            return detail::lexical_cast<T>(str);
-        }
-    };
-
-    template <class T>
-    struct range_reader {
-        range_reader(const T &low, const T &high) : low(low), high(high) {}
-        T operator()(const std::string &s) const {
-            T ret = default_reader<T>()(s);
-            if (!(ret >= low && ret <= high)) throw cmdline::cmdline_error("range_error");
-            return ret;
-        }
-    private:
-        T low, high;
-    };
-
-    template <class T>
-    range_reader<T> range(const T &low, const T &high)
+}
+
+template<class T>
+std::string readable_typename()
+{
+    return demangle(typeid(T).name());
+}
+
+template<class T>
+std::string default_value(T def)
+{
+    return detail::lexical_cast<std::string>(def);
+}
+
+template<>
+inline std::string readable_typename<std::string>()
+{
+    return "string";
+}
+
+} // namespace detail
+
+//-----
+
+class cmdline_error : public std::exception
+{
+public:
+    cmdline_error(const std::string& msg)
+        : msg(msg)
+    {
+    }
+    ~cmdline_error() throw()
+    {
+    }
+    const char* what() const throw()
     {
-        return range_reader<T>(low, high);
+        return msg.c_str();
     }
 
-    template <class T>
-    struct oneof_reader {
-        T operator()(const std::string &s) {
-            T ret = default_reader<T>()(s);
-            if (std::find(alt.begin(), alt.end(), ret) == alt.end())
-                throw cmdline_error("");
-            return ret;
-        }
-        void add(const T &v) { alt.push_back(v); }
-    private:
-        std::vector<T> alt;
-    };
+private:
+    std::string msg;
+};
 
-    template <class T>
-    oneof_reader<T> oneof(T a1)
+template<class T>
+struct default_reader
+{
+    T operator()(const std::string& str)
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        return ret;
+        return detail::lexical_cast<T>(str);
     }
+};
 
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2)
+template<class T>
+struct range_reader
+{
+    range_reader(const T& low, const T& high)
+        : low(low), high(high)
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        return ret;
     }
-
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3)
+    T operator()(const std::string& s) const
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
+        T ret = default_reader<T>()(s);
+        if (!(ret >= low && ret <= high)) throw cmdline::cmdline_error("range_error");
         return ret;
     }
 
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3, T a4)
+private:
+    T low, high;
+};
+
+template<class T>
+range_reader<T> range(const T& low, const T& high)
+{
+    return range_reader<T>(low, high);
+}
+
+template<class T>
+struct oneof_reader
+{
+    T operator()(const std::string& s)
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
-        ret.add(a4);
+        T ret = default_reader<T>()(s);
+        if (std::find(alt.begin(), alt.end(), ret) == alt.end())
+            throw cmdline_error("");
         return ret;
     }
-
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5)
+    void add(const T& v)
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
-        ret.add(a4);
-        ret.add(a5);
-        return ret;
+        alt.push_back(v);
     }
 
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6)
+private:
+    std::vector<T> alt;
+};
+
+template<class T>
+oneof_reader<T> oneof(T a1)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    ret.add(a4);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    ret.add(a4);
+    ret.add(a5);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    ret.add(a4);
+    ret.add(a5);
+    ret.add(a6);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    ret.add(a4);
+    ret.add(a5);
+    ret.add(a6);
+    ret.add(a7);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    ret.add(a4);
+    ret.add(a5);
+    ret.add(a6);
+    ret.add(a7);
+    ret.add(a8);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    ret.add(a4);
+    ret.add(a5);
+    ret.add(a6);
+    ret.add(a7);
+    ret.add(a8);
+    ret.add(a9);
+    return ret;
+}
+
+template<class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10)
+{
+    oneof_reader<T> ret;
+    ret.add(a1);
+    ret.add(a2);
+    ret.add(a3);
+    ret.add(a4);
+    ret.add(a5);
+    ret.add(a6);
+    ret.add(a7);
+    ret.add(a8);
+    ret.add(a9);
+    ret.add(a10);
+    return ret;
+}
+
+//-----
+
+class parser
+{
+public:
+    parser()
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
-        ret.add(a4);
-        ret.add(a5);
-        ret.add(a6);
-        return ret;
+    }
+    ~parser()
+    {
+        for (std::map<std::string, option_base*>::iterator p = options.begin();
+             p != options.end(); p++)
+            delete p->second;
     }
 
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7)
+    void add(const std::string& name,
+             char short_name = 0,
+             const std::string& desc = "")
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
-        ret.add(a4);
-        ret.add(a5);
-        ret.add(a6);
-        ret.add(a7);
-        return ret;
+        if (options.count(name)) throw cmdline_error("multiple definition: " + name);
+        options[name] = new option_without_value(name, short_name, desc);
+        ordered.push_back(options[name]);
     }
 
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8)
+    template<class T>
+    void add(const std::string& name,
+             char short_name = 0,
+             const std::string& desc = "",
+             bool need = true,
+             const T def = T())
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
-        ret.add(a4);
-        ret.add(a5);
-        ret.add(a6);
-        ret.add(a7);
-        ret.add(a8);
-        return ret;
+        add(name, short_name, desc, need, def, default_reader<T>());
     }
 
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9)
+    template<class T, class F>
+    void add(const std::string& name,
+             char short_name = 0,
+             const std::string& desc = "",
+             bool need = true,
+             const T def = T(),
+             F reader = F())
     {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
-        ret.add(a4);
-        ret.add(a5);
-        ret.add(a6);
-        ret.add(a7);
-        ret.add(a8);
-        ret.add(a9);
-        return ret;
+        if (options.count(name)) throw cmdline_error("multiple definition: " + name);
+        options[name] = new option_with_value_with_reader<T, F>(name, short_name, need, def, desc, reader);
+        ordered.push_back(options[name]);
     }
 
-    template <class T>
-    oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10)
-    {
-        oneof_reader<T> ret;
-        ret.add(a1);
-        ret.add(a2);
-        ret.add(a3);
-        ret.add(a4);
-        ret.add(a5);
-        ret.add(a6);
-        ret.add(a7);
-        ret.add(a8);
-        ret.add(a9);
-        ret.add(a10);
-        return ret;
+    void footer(const std::string& f)
+    {
+        ftr = f;
     }
 
-    //-----
+    void set_program_name(const std::string& name)
+    {
+        prog_name = name;
+    }
 
-    class parser {
-    public:
-        parser() {
-        }
-        ~parser() {
-            for (std::map<std::string, option_base*>::iterator p = options.begin();
-                p != options.end(); p++)
-                delete p->second;
-        }
+    bool exist(const std::string& name) const
+    {
+        if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name);
+        return options.find(name)->second->has_set();
+    }
 
-        void add(const std::string &name,
-            char short_name = 0,
-            const std::string &desc = "") {
-            if (options.count(name)) throw cmdline_error("multiple definition: " + name);
-            options[name] = new option_without_value(name, short_name, desc);
-            ordered.push_back(options[name]);
-        }
+    template<class T>
+    const T& get(const std::string& name) const
+    {
+        if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name);
+        const option_with_value<T>* p = dynamic_cast<const option_with_value<T>*>(options.find(name)->second);
+        if (p == NULL) throw cmdline_error("type mismatch flag '" + name + "'");
+        return p->get();
+    }
 
-        template <class T>
-        void add(const std::string &name,
-            char short_name = 0,
-            const std::string &desc = "",
-            bool need = true,
-            const T def = T()) {
-            add(name, short_name, desc, need, def, default_reader<T>());
-        }
+    const std::vector<std::string>& rest() const
+    {
+        return others;
+    }
 
-        template <class T, class F>
-        void add(const std::string &name,
-            char short_name = 0,
-            const std::string &desc = "",
-            bool need = true,
-            const T def = T(),
-            F reader = F()) {
-            if (options.count(name)) throw cmdline_error("multiple definition: " + name);
-            options[name] = new option_with_value_with_reader<T, F>(name, short_name, need, def, desc, reader);
-            ordered.push_back(options[name]);
-        }
+    bool parse(const std::string& arg)
+    {
+        std::vector<std::string> args;
 
-        void footer(const std::string &f) {
-            ftr = f;
-        }
+        std::string buf;
+        bool in_quote = false;
+        for (std::string::size_type i = 0; i < arg.length(); i++)
+        {
+            if (arg[i] == '\"')
+            {
+                in_quote = !in_quote;
+                continue;
+            }
 
-        void set_program_name(const std::string &name) {
-            prog_name = name;
-        }
+            if (arg[i] == ' ' && !in_quote)
+            {
+                args.push_back(buf);
+                buf = "";
+                continue;
+            }
 
-        bool exist(const std::string &name) const {
-            if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name);
-            return options.find(name)->second->has_set();
-        }
+            if (arg[i] == '\\')
+            {
+                i++;
+                if (i >= arg.length())
+                {
+                    errors.push_back("unexpected occurrence of '\\' at end of string");
+                    return false;
+                }
+            }
 
-        template <class T>
-        const T &get(const std::string &name) const {
-            if (options.count(name) == 0) throw cmdline_error("there is no flag: --" + name);
-            const option_with_value<T> *p = dynamic_cast<const option_with_value<T>*>(options.find(name)->second);
-            if (p == NULL) throw cmdline_error("type mismatch flag '" + name + "'");
-            return p->get();
+            buf += arg[i];
         }
 
-        const std::vector<std::string> &rest() const {
-            return others;
+        if (in_quote)
+        {
+            errors.push_back("quote is not closed");
+            return false;
         }
 
-        bool parse(const std::string &arg) {
-            std::vector<std::string> args;
+        if (buf.length() > 0)
+            args.push_back(buf);
 
-            std::string buf;
-            bool in_quote = false;
-            for (std::string::size_type i = 0; i < arg.length(); i++) {
-                if (arg[i] == '\"') {
-                    in_quote = !in_quote;
-                    continue;
-                }
+        for (size_t i = 0; i < args.size(); i++)
+            std::cout << "\"" << args[i] << "\"" << std::endl;
 
-                if (arg[i] == ' ' && !in_quote) {
-                    args.push_back(buf);
-                    buf = "";
-                    continue;
-                }
-
-                if (arg[i] == '\\') {
-                    i++;
-                    if (i >= arg.length()) {
-                        errors.push_back("unexpected occurrence of '\\' at end of string");
-                        return false;
-                    }
-                }
-
-                buf += arg[i];
-            }
-
-            if (in_quote) {
-                errors.push_back("quote is not closed");
-                return false;
-            }
-
-            if (buf.length() > 0)
-                args.push_back(buf);
+        return parse(args);
+    }
 
-            for (size_t i = 0; i < args.size(); i++)
-                std::cout << "\"" << args[i] << "\"" << std::endl;
+    bool parse(const std::vector<std::string>& args)
+    {
+        int argc = static_cast<int>(args.size());
+        std::vector<const char*> argv(argc);
 
-            return parse(args);
-        }
+        for (int i = 0; i < argc; i++)
+            argv[i] = args[i].c_str();
 
-        bool parse(const std::vector<std::string> &args) {
-            int argc = static_cast<int>(args.size());
-            std::vector<const char*> argv(argc);
+        return parse(argc, &argv[0]);
+    }
 
-            for (int i = 0; i < argc; i++)
-                argv[i] = args[i].c_str();
+    bool parse(int argc, const char* const argv[])
+    {
+        errors.clear();
+        others.clear();
 
-            return parse(argc, &argv[0]);
+        if (argc < 1)
+        {
+            errors.push_back("argument number must be longer than 0");
+            return false;
         }
+        if (prog_name == "")
+            prog_name = argv[0];
 
-        bool parse(int argc, const char * const argv[]) {
-            errors.clear();
-            others.clear();
-
-            if (argc < 1) {
-                errors.push_back("argument number must be longer than 0");
-                return false;
-            }
-            if (prog_name == "")
-                prog_name = argv[0];
-
-            std::map<char, std::string> lookup;
-            for (std::map<std::string, option_base*>::iterator p = options.begin();
-                p != options.end(); p++) {
-                if (p->first.length() == 0) continue;
-                char initial = p->second->short_name();
-                if (initial) {
-                    if (lookup.count(initial) > 0) {
-                        lookup[initial] = "";
-                        errors.push_back(std::string("short option '") + initial + "' is ambiguous");
-                        return false;
-                    }
-                    else lookup[initial] = p->first;
+        std::map<char, std::string> lookup;
+        for (std::map<std::string, option_base*>::iterator p = options.begin();
+             p != options.end(); p++)
+        {
+            if (p->first.length() == 0) continue;
+            char initial = p->second->short_name();
+            if (initial)
+            {
+                if (lookup.count(initial) > 0)
+                {
+                    lookup[initial] = "";
+                    errors.push_back(std::string("short option '") + initial + "' is ambiguous");
+                    return false;
                 }
+                else
+                    lookup[initial] = p->first;
             }
+        }
 
-            for (int i = 1; i < argc; i++) {
-                if (strncmp(argv[i], "--", 2) == 0) {
-                    const char *p = strchr(argv[i] + 2, '=');
-                    if (p) {
-                        std::string name(argv[i] + 2, p);
-                        std::string val(p + 1);
-                        set_option(name, val);
+        for (int i = 1; i < argc; i++)
+        {
+            if (strncmp(argv[i], "--", 2) == 0)
+            {
+                const char* p = strchr(argv[i] + 2, '=');
+                if (p)
+                {
+                    std::string name(argv[i] + 2, p);
+                    std::string val(p + 1);
+                    set_option(name, val);
+                }
+                else
+                {
+                    std::string name(argv[i] + 2);
+                    if (options.count(name) == 0)
+                    {
+                        errors.push_back("undefined option: --" + name);
+                        continue;
                     }
-                    else {
-                        std::string name(argv[i] + 2);
-                        if (options.count(name) == 0) {
-                            errors.push_back("undefined option: --" + name);
+                    if (options[name]->has_value())
+                    {
+                        if (i + 1 >= argc)
+                        {
+                            errors.push_back("option needs value: --" + name);
                             continue;
                         }
-                        if (options[name]->has_value()) {
-                            if (i + 1 >= argc) {
-                                errors.push_back("option needs value: --" + name);
-                                continue;
-                            }
-                            else {
-                                i++;
-                                set_option(name, argv[i]);
-                            }
-                        }
-                        else {
-                            set_option(name);
+                        else
+                        {
+                            i++;
+                            set_option(name, argv[i]);
                         }
                     }
-                }
-                else if (strncmp(argv[i], "-", 1) == 0) {
-                    if (!argv[i][1]) continue;
-                    char last = argv[i][1];
-                    for (int j = 2; argv[i][j]; j++) {
-                        last = argv[i][j];
-                        if (lookup.count(argv[i][j - 1]) == 0) {
-                            errors.push_back(std::string("undefined short option: -") + argv[i][j - 1]);
-                            continue;
-                        }
-                        if (lookup[argv[i][j - 1]] == "") {
-                            errors.push_back(std::string("ambiguous short option: -") + argv[i][j - 1]);
-                            continue;
-                        }
-                        set_option(lookup[argv[i][j - 1]]);
+                    else
+                    {
+                        set_option(name);
                     }
-
-                    if (lookup.count(last) == 0) {
-                        errors.push_back(std::string("undefined short option: -") + last);
+                }
+            }
+            else if (strncmp(argv[i], "-", 1) == 0)
+            {
+                if (!argv[i][1]) continue;
+                char last = argv[i][1];
+                for (int j = 2; argv[i][j]; j++)
+                {
+                    last = argv[i][j];
+                    if (lookup.count(argv[i][j - 1]) == 0)
+                    {
+                        errors.push_back(std::string("undefined short option: -") + argv[i][j - 1]);
                         continue;
                     }
-                    if (lookup[last] == "") {
-                        errors.push_back(std::string("ambiguous short option: -") + last);
+                    if (lookup[argv[i][j - 1]] == "")
+                    {
+                        errors.push_back(std::string("ambiguous short option: -") + argv[i][j - 1]);
                         continue;
                     }
+                    set_option(lookup[argv[i][j - 1]]);
+                }
 
-                    if (i + 1 < argc && options[lookup[last]]->has_value()) {
-                        set_option(lookup[last], argv[i + 1]);
-                        i++;
-                    }
-                    else {
-                        set_option(lookup[last]);
-                    }
+                if (lookup.count(last) == 0)
+                {
+                    errors.push_back(std::string("undefined short option: -") + last);
+                    continue;
+                }
+                if (lookup[last] == "")
+                {
+                    errors.push_back(std::string("ambiguous short option: -") + last);
+                    continue;
+                }
+
+                if (i + 1 < argc && options[lookup[last]]->has_value())
+                {
+                    set_option(lookup[last], argv[i + 1]);
+                    i++;
                 }
-                else {
-                    others.push_back(argv[i]);
+                else
+                {
+                    set_option(lookup[last]);
                 }
             }
+            else
+            {
+                others.push_back(argv[i]);
+            }
+        }
+
+        for (std::map<std::string, option_base*>::iterator p = options.begin();
+             p != options.end(); p++)
+            if (!p->second->valid())
+                errors.push_back("need option: --" + std::string(p->first));
+
+        return errors.size() == 0;
+    }
+
+    void parse_check(const std::string& arg)
+    {
+        if (!options.count("help"))
+            add("help", '?', "print this message");
+        check(0, parse(arg));
+    }
+
+    void parse_check(const std::vector<std::string>& args)
+    {
+        if (!options.count("help"))
+            add("help", '?', "print this message");
+        check((int)(args.size()), parse(args));
+    }
+
+    void parse_check(int argc, char* argv[])
+    {
+        if (!options.count("help"))
+            add("help", '?', "print this message");
+        check(argc, parse(argc, argv));
+    }
 
-            for (std::map<std::string, option_base*>::iterator p = options.begin();
-                p != options.end(); p++)
-                if (!p->second->valid())
-                    errors.push_back("need option: --" + std::string(p->first));
+    std::string error() const
+    {
+        return errors.size() > 0 ? errors[0] : "";
+    }
+
+    std::string error_full() const
+    {
+        std::ostringstream oss;
+        for (size_t i = 0; i < errors.size(); i++)
+            oss << errors[i] << std::endl;
+        return oss.str();
+    }
 
-            return errors.size() == 0;
+    std::string usage() const
+    {
+        std::ostringstream oss;
+        oss << "usage: " << prog_name << " ";
+        for (size_t i = 0; i < ordered.size(); i++)
+        {
+            if (ordered[i]->must())
+                oss << ordered[i]->short_description() << " ";
         }
 
-        void parse_check(const std::string &arg) {
-            if (!options.count("help"))
-                add("help", '?', "print this message");
-            check(0, parse(arg));
+        oss << "[options] ... " << ftr << std::endl;
+        oss << "options:" << std::endl;
+
+        size_t max_width = 0;
+        for (size_t i = 0; i < ordered.size(); i++)
+        {
+            max_width = std::max(max_width, ordered[i]->name().length());
         }
+        for (size_t i = 0; i < ordered.size(); i++)
+        {
+            if (ordered[i]->short_name())
+            {
+                oss << "  -" << ordered[i]->short_name() << ", ";
+            }
+            else
+            {
+                oss << "      ";
+            }
 
-        void parse_check(const std::vector<std::string> &args) {
-            if (!options.count("help"))
-                add("help", '?', "print this message");
-            check((int)(args.size()), parse(args));
+            oss << "--" << ordered[i]->name();
+            for (size_t j = ordered[i]->name().length(); j < max_width + 4; j++)
+                oss << ' ';
+            oss << ordered[i]->description() << std::endl;
         }
+        return oss.str();
+    }
 
-        void parse_check(int argc, char *argv[]) {
-            if (!options.count("help"))
-                add("help", '?', "print this message");
-            check(argc, parse(argc, argv));
+private:
+    void check(int argc, bool ok)
+    {
+        if ((argc == 1 && !ok) || exist("help"))
+        {
+            std::cerr << usage();
+            exit(0);
         }
 
-        std::string error() const {
-            return errors.size() > 0 ? errors[0] : "";
+        if (!ok)
+        {
+            std::cerr << error() << std::endl
+                      << usage();
+            exit(1);
         }
+    }
 
-        std::string error_full() const {
-            std::ostringstream oss;
-            for (size_t i = 0; i < errors.size(); i++)
-                oss << errors[i] << std::endl;
-            return oss.str();
+    void set_option(const std::string& name)
+    {
+        if (options.count(name) == 0)
+        {
+            errors.push_back("undefined option: --" + name);
+            return;
         }
+        if (!options[name]->set())
+        {
+            errors.push_back("option needs value: --" + name);
+            return;
+        }
+    }
 
-        std::string usage() const {
-            std::ostringstream oss;
-            oss << "usage: " << prog_name << " ";
-            for (size_t i = 0; i < ordered.size(); i++) {
-                if (ordered[i]->must())
-                    oss << ordered[i]->short_description() << " ";
-            }
+    void set_option(const std::string& name, const std::string& value)
+    {
+        if (options.count(name) == 0)
+        {
+            errors.push_back("undefined option: --" + name);
+            return;
+        }
+        if (!options[name]->set(value))
+        {
+            errors.push_back("option value is invalid: --" + name + "=" + value);
+            return;
+        }
+    }
 
-            oss << "[options] ... " << ftr << std::endl;
-            oss << "options:" << std::endl;
+    class option_base
+    {
+    public:
+        virtual ~option_base()
+        {
+        }
 
-            size_t max_width = 0;
-            for (size_t i = 0; i < ordered.size(); i++) {
-                max_width = std::max(max_width, ordered[i]->name().length());
-            }
-            for (size_t i = 0; i < ordered.size(); i++) {
-                if (ordered[i]->short_name()) {
-                    oss << "  -" << ordered[i]->short_name() << ", ";
-                }
-                else {
-                    oss << "      ";
-                }
+        virtual bool has_value() const = 0;
+        virtual bool set() = 0;
+        virtual bool set(const std::string& value) = 0;
+        virtual bool has_set() const = 0;
+        virtual bool valid() const = 0;
+        virtual bool must() const = 0;
+
+        virtual const std::string& name() const = 0;
+        virtual char short_name() const = 0;
+        virtual const std::string& description() const = 0;
+        virtual std::string short_description() const = 0;
+    };
 
-                oss << "--" << ordered[i]->name();
-                for (size_t j = ordered[i]->name().length(); j < max_width + 4; j++)
-                    oss << ' ';
-                oss << ordered[i]->description() << std::endl;
-            }
-            return oss.str();
+    class option_without_value : public option_base
+    {
+    public:
+        option_without_value(const std::string& name,
+                             char short_name,
+                             const std::string& desc)
+            : nam(name), snam(short_name), desc(desc), has(false)
+        {
+        }
+        ~option_without_value()
+        {
         }
 
-    private:
-
-        void check(int argc, bool ok) {
-            if ((argc == 1 && !ok) || exist("help")) {
-                std::cerr << usage();
-                exit(0);
-            }
+        bool has_value() const
+        {
+            return false;
+        }
 
-            if (!ok) {
-                std::cerr << error() << std::endl << usage();
-                exit(1);
-            }
+        bool set()
+        {
+            has = true;
+            return true;
         }
 
-        void set_option(const std::string &name) {
-            if (options.count(name) == 0) {
-                errors.push_back("undefined option: --" + name);
-                return;
-            }
-            if (!options[name]->set()) {
-                errors.push_back("option needs value: --" + name);
-                return;
-            }
+        bool set(const std::string&)
+        {
+            return false;
         }
 
-        void set_option(const std::string &name, const std::string &value) {
-            if (options.count(name) == 0) {
-                errors.push_back("undefined option: --" + name);
-                return;
-            }
-            if (!options[name]->set(value)) {
-                errors.push_back("option value is invalid: --" + name + "=" + value);
-                return;
-            }
+        bool has_set() const
+        {
+            return has;
         }
 
-        class option_base {
-        public:
-            virtual ~option_base() {}
-
-            virtual bool has_value() const = 0;
-            virtual bool set() = 0;
-            virtual bool set(const std::string &value) = 0;
-            virtual bool has_set() const = 0;
-            virtual bool valid() const = 0;
-            virtual bool must() const = 0;
-
-            virtual const std::string &name() const = 0;
-            virtual char short_name() const = 0;
-            virtual const std::string &description() const = 0;
-            virtual std::string short_description() const = 0;
-        };
-
-        class option_without_value : public option_base {
-        public:
-            option_without_value(const std::string &name,
-                char short_name,
-                const std::string &desc)
-                :nam(name), snam(short_name), desc(desc), has(false) {
-            }
-            ~option_without_value() {}
+        bool valid() const
+        {
+            return true;
+        }
 
-            bool has_value() const { return false; }
+        bool must() const
+        {
+            return false;
+        }
 
-            bool set() {
-                has = true;
-                return true;
-            }
+        const std::string& name() const
+        {
+            return nam;
+        }
 
-            bool set(const std::string &) {
-                return false;
-            }
+        char short_name() const
+        {
+            return snam;
+        }
 
-            bool has_set() const {
-                return has;
-            }
+        const std::string& description() const
+        {
+            return desc;
+        }
 
-            bool valid() const {
-                return true;
-            }
+        std::string short_description() const
+        {
+            return "--" + nam;
+        }
 
-            bool must() const {
-                return false;
-            }
+    private:
+        std::string nam;
+        char snam;
+        std::string desc;
+        bool has;
+    };
 
-            const std::string &name() const {
-                return nam;
-            }
+    template<class T>
+    class option_with_value : public option_base
+    {
+    public:
+        option_with_value(const std::string& name,
+                          char short_name,
+                          bool need,
+                          const T& def,
+                          const std::string& desc)
+            : nam(name), snam(short_name), need(need), has(false), def(def), actual(def)
+        {
+            this->desc = full_description(desc);
+        }
+        ~option_with_value()
+        {
+        }
 
-            char short_name() const {
-                return snam;
-            }
+        const T& get() const
+        {
+            return actual;
+        }
 
-            const std::string &description() const {
-                return desc;
-            }
+        bool has_value() const
+        {
+            return true;
+        }
 
-            std::string short_description() const {
-                return "--" + nam;
-            }
+        bool set()
+        {
+            return false;
+        }
 
-        private:
-            std::string nam;
-            char snam;
-            std::string desc;
-            bool has;
-        };
-
-        template <class T>
-        class option_with_value : public option_base {
-        public:
-            option_with_value(const std::string &name,
-                char short_name,
-                bool need,
-                const T &def,
-                const std::string &desc)
-                : nam(name), snam(short_name), need(need), has(false)
-                , def(def), actual(def) {
-                this->desc = full_description(desc);
+        bool set(const std::string& value)
+        {
+            try
+            {
+                actual = read(value);
+                has = true;
             }
-            ~option_with_value() {}
-
-            const T &get() const {
-                return actual;
+            catch (const std::exception& e)
+            {
+                (void)e;
+                return false;
             }
+            return true;
+        }
 
-            bool has_value() const { return true; }
+        bool has_set() const
+        {
+            return has;
+        }
 
-            bool set() {
-                return false;
-            }
+        bool valid() const
+        {
+            if (need && !has) return false;
+            return true;
+        }
 
-            bool set(const std::string &value) {
-                try {
-                    actual = read(value);
-                    has = true;
-                }
-                catch (const std::exception &e) {
-                    (void)e;
-                    return false;
-                }
-                return true;
-            }
+        bool must() const
+        {
+            return need;
+        }
 
-            bool has_set() const {
-                return has;
-            }
+        const std::string& name() const
+        {
+            return nam;
+        }
 
-            bool valid() const {
-                if (need && !has) return false;
-                return true;
-            }
+        char short_name() const
+        {
+            return snam;
+        }
 
-            bool must() const {
-                return need;
-            }
+        const std::string& description() const
+        {
+            return desc;
+        }
 
-            const std::string &name() const {
-                return nam;
-            }
+        std::string short_description() const
+        {
+            return "--" + nam + "=" + detail::readable_typename<T>();
+        }
 
-            char short_name() const {
-                return snam;
-            }
+    protected:
+        std::string full_description(const std::string& desc_str)
+        {
+            return desc_str + " (" + detail::readable_typename<T>() + (need ? "" : " [=" + detail::default_value<T>(def) + "]")
+                   + ")";
+        }
 
-            const std::string &description() const {
-                return desc;
-            }
+        virtual T read(const std::string& s) = 0;
 
-            std::string short_description() const {
-                return "--" + nam + "=" + detail::readable_typename<T>();
-            }
+        std::string nam;
+        char snam;
+        bool need;
+        std::string desc;
 
-        protected:
-            std::string full_description(const std::string& desc_str) {
-                return
-                    desc_str + " (" + detail::readable_typename<T>() +
-                    (need ? "" : " [=" + detail::default_value<T>(def) + "]")
-                    + ")";
-            }
+        bool has;
+        T def;
+        T actual;
+    };
 
-            virtual T read(const std::string &s) = 0;
-
-            std::string nam;
-            char snam;
-            bool need;
-            std::string desc;
-
-            bool has;
-            T def;
-            T actual;
-        };
-
-        template <class T, class F>
-        class option_with_value_with_reader : public option_with_value<T> {
-        public:
-            option_with_value_with_reader(const std::string &name,
-                char short_name,
-                bool need,
-                const T def,
-                const std::string &desc,
-                F reader)
-                : option_with_value<T>(name, short_name, need, def, desc), reader(reader) {
-            }
+    template<class T, class F>
+    class option_with_value_with_reader : public option_with_value<T>
+    {
+    public:
+        option_with_value_with_reader(const std::string& name,
+                                      char short_name,
+                                      bool need,
+                                      const T def,
+                                      const std::string& desc,
+                                      F reader)
+            : option_with_value<T>(name, short_name, need, def, desc), reader(reader)
+        {
+        }
 
-        private:
-            T read(const std::string &s) {
-                return reader(s);
-            }
+    private:
+        T read(const std::string& s)
+        {
+            return reader(s);
+        }
 
-            F reader;
-        };
+        F reader;
+    };
 
-        std::map<std::string, option_base*> options;
-        std::vector<option_base*> ordered;
-        std::string ftr;
+    std::map<std::string, option_base*> options;
+    std::vector<option_base*> ordered;
+    std::string ftr;
 
-        std::string prog_name;
-        std::vector<std::string> others;
+    std::string prog_name;
+    std::vector<std::string> others;
 
-        std::vector<std::string> errors;
-    };
+    std::vector<std::string> errors;
+};
 
-} // cmdline
+} // namespace cmdline
diff --git a/examples/common/common.h b/examples/common/common.h
index 40a263aba..9ab861855 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -42,9 +42,9 @@
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
-#else    // _WIN32
+#else // _WIN32
 #include <sys/time.h>
-#endif    // _WIN32
+#endif // _WIN32
 
 #ifdef _WIN32
 static double get_current_time()
@@ -56,7 +56,7 @@ static double get_current_time()
 
     return pc.QuadPart * 1000.0 / freq.QuadPart;
 }
-#else    // _WIN32
+#else  // _WIN32
 
 static double get_current_time()
 {
@@ -65,7 +65,7 @@ static double get_current_time()
 
     return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
 }
-#endif    // _WIN32
+#endif // _WIN32
 
 static void split(float* array, char* str, const char* del)
 {
@@ -78,4 +78,4 @@ static void split(float* array, char* str, const char* del)
     }
 }
 
-#endif    // __COMMON_H__
+#endif // __COMMON_H__
diff --git a/examples/common/compiler_fp16.h b/examples/common/compiler_fp16.h
index 1857d7eec..d770707c2 100644
--- a/examples/common/compiler_fp16.h
+++ b/examples/common/compiler_fp16.h
@@ -48,7 +48,7 @@ extern "C" {
 
 #else
 #ifdef _MSC_VER
-#pragma  pack (push,1)
+#pragma pack(push, 1)
 struct fp16_pack
 {
     unsigned short frac : 10;
@@ -84,12 +84,12 @@ typedef struct fp16_pack __fp16;
 static inline float fp16_to_fp32(__fp16 data)
 {
     float f;
-    struct fp32_pack* fp32 = ( struct fp32_pack* )&f;
+    struct fp32_pack* fp32 = (struct fp32_pack*)&f;
     struct fp16_pack* fp16 = &data;
 
     int exp = fp16->exp;
 
-    if(exp == 31 && fp16->frac != 0)
+    if (exp == 31 && fp16->frac != 0)
     {
         // return __builtin_inf()-__builtin_inf();
         fp32->sign = fp16->sign;
@@ -99,28 +99,28 @@ static inline float fp16_to_fp32(__fp16 data)
         return f;
     }
 
-    if(exp == 31)
+    if (exp == 31)
         exp = 255;
-    if(exp == 0)
+    if (exp == 0)
         exp = 0;
     else
         exp = (exp - 15) + 127;
 
     fp32->exp = exp;
     fp32->sign = fp16->sign;
-    fp32->frac = (( int )fp16->frac) << 13;
+    fp32->frac = ((int)fp16->frac) << 13;
 
     return f;
 }
 
 static inline __fp16 fp32_to_fp16(float data)
 {
-    struct fp32_pack* fp32 = ( struct fp32_pack* )&data;
+    struct fp32_pack* fp32 = (struct fp32_pack*)&data;
     struct fp16_pack fp16;
 
     int exp = fp32->exp;
 
-    if(fp32->exp == 255 && fp32->frac != 0)
+    if (fp32->exp == 255 && fp32->frac != 0)
     {
         // NaN
         fp16.exp = 31;
@@ -130,9 +130,9 @@ static inline __fp16 fp32_to_fp16(float data)
         return fp16;
     }
 
-    if((exp - 127) < -14)
+    if ((exp - 127) < -14)
         exp = 0;
-    else if((exp - 127) > 15)
+    else if ((exp - 127) > 15)
         exp = 31;
     else
         exp = exp - 127 + 15;
diff --git a/examples/common/msc_getopt.h b/examples/common/msc_getopt.h
index 0cb88895d..caafad5b2 100644
--- a/examples/common/msc_getopt.h
+++ b/examples/common/msc_getopt.h
@@ -8,7 +8,7 @@
  * IMPLIED ARE HEREBY DISCLAIMED.  This includes but is not limited to 
  * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  */
- /*
+/*
  * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
  *
  * Permission to use, copy, modify, and distribute this software for any
@@ -56,7 +56,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#pragma warning(disable:4996);
+#pragma warning(disable : 4996);
 
 #define __GETOPT_H__
 
@@ -73,16 +73,16 @@
 extern "C" {
 #endif
 
-#define	REPLACE_GETOPT		/* use this getopt as the system getopt(3) */
+#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */
 
 #ifdef REPLACE_GETOPT
-int	opterr = 1;		/* if error message should be printed */
-int	optind = 1;		/* index into parent argv vector */
-int	optopt = '?';		/* character checked for validity */
-#undef	optreset		/* see getopt.h */
-#define	optreset		__mingw_optreset
-int	optreset;		/* reset getopt */
-char    *optarg;		/* argument associated with option */
+int opterr = 1;   /* if error message should be printed */
+int optind = 1;   /* index into parent argv vector */
+int optopt = '?'; /* character checked for validity */
+#undef optreset   /* see getopt.h */
+#define optreset __mingw_optreset
+int optreset; /* reset getopt */
+char* optarg; /* argument associated with option */
 #endif
 
 //extern int optind;		/* index of first non-option in argv      */
@@ -92,37 +92,37 @@ char    *optarg;		/* argument associated with option */
 //
 //extern char *optarg;		/* pointer to argument of current option  */
 
-#define PRINT_ERROR	((opterr) && (*options != ':'))
+#define PRINT_ERROR ((opterr) && (*options != ':'))
 
-#define FLAG_PERMUTE	0x01	/* permute non-options to the end of argv */
-#define FLAG_ALLARGS	0x02	/* treat non-options as args to option "-1" */
-#define FLAG_LONGONLY	0x04	/* operate as getopt_long_only */
+#define FLAG_PERMUTE  0x01 /* permute non-options to the end of argv */
+#define FLAG_ALLARGS  0x02 /* treat non-options as args to option "-1" */
+#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */
 
 /* return values */
-#define	BADCH		(int)'?'
-#define	BADARG		((*options == ':') ? (int)':' : (int)'?')
-#define	INORDER 	(int)1
+#define BADCH   (int)'?'
+#define BADARG  ((*options == ':') ? (int)':' : (int)'?')
+#define INORDER (int)1
 
 #ifndef __CYGWIN__
 #define __progname __argv[0]
 #else
-extern char __declspec(dllimport) *__progname;
+extern char __declspec(dllimport) * __progname;
 #endif
 
 #ifdef __CYGWIN__
 static char EMSG[] = "";
 #else
-#define	EMSG		""
+#define EMSG ""
 #endif
 
-static int getopt_internal(int, char * const *, const char *,
-			   const struct option *, int *, int);
-static int parse_long_options(char * const *, const char *,
-			      const struct option *, int *, int);
+static int getopt_internal(int, char* const*, const char*,
+                           const struct option*, int*, int);
+static int parse_long_options(char* const*, const char*,
+                              const struct option*, int*, int);
 static int gcd(int, int);
-static void permute_args(int, int, int, char * const *);
+static void permute_args(int, int, int, char* const*);
 
-static char *place = EMSG; /* option letter processing */
+static char* place = EMSG; /* option letter processing */
 
 /* XXX: set optreset to 1 rather than these two */
 static int nonopt_start = -1; /* first non option argument (for permute) */
@@ -137,21 +137,21 @@ static const char illoptchar[] = "unknown option -- %c";
 static const char illoptstring[] = "unknown option -- %s";
 
 static void
-_vwarnx(const char *fmt,va_list ap)
+_vwarnx(const char* fmt, va_list ap)
 {
-  (void)fprintf(stderr,"%s: ",__progname);
-  if (fmt != NULL)
-    (void)vfprintf(stderr,fmt,ap);
-  (void)fprintf(stderr,"\n");
+    (void)fprintf(stderr, "%s: ", __progname);
+    if (fmt != NULL)
+        (void)vfprintf(stderr, fmt, ap);
+    (void)fprintf(stderr, "\n");
 }
 
 static void
-warnx(const char *fmt,...)
+warnx(const char* fmt, ...)
 {
-  va_list ap;
-  va_start(ap,fmt);
-  _vwarnx(fmt,ap);
-  va_end(ap);
+    va_list ap;
+    va_start(ap, fmt);
+    _vwarnx(fmt, ap);
+    va_end(ap);
 }
 
 /*
@@ -160,16 +160,17 @@ warnx(const char *fmt,...)
 static int
 gcd(int a, int b)
 {
-	int c;
+    int c;
 
-	c = a % b;
-	while (c != 0) {
-		a = b;
-		b = c;
-		c = a % b;
-	}
+    c = a % b;
+    while (c != 0)
+    {
+        a = b;
+        b = c;
+        c = a % b;
+    }
 
-	return (b);
+    return (b);
 }
 
 /*
@@ -179,34 +180,36 @@ gcd(int a, int b)
  */
 static void
 permute_args(int panonopt_start, int panonopt_end, int opt_end,
-	char * const *nargv)
+             char* const* nargv)
 {
-	int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
-	char *swap;
+    int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
+    char* swap;
 
-	/*
+    /*
 	 * compute lengths of blocks and number and size of cycles
 	 */
-	nnonopts = panonopt_end - panonopt_start;
-	nopts = opt_end - panonopt_end;
-	ncycle = gcd(nnonopts, nopts);
-	cyclelen = (opt_end - panonopt_start) / ncycle;
-
-	for (i = 0; i < ncycle; i++) {
-		cstart = panonopt_end+i;
-		pos = cstart;
-		for (j = 0; j < cyclelen; j++) {
-			if (pos >= panonopt_end)
-				pos -= nnonopts;
-			else
-				pos += nopts;
-			swap = nargv[pos];
-			/* LINTED const cast */
-			((char **) nargv)[pos] = nargv[cstart];
-			/* LINTED const cast */
-			((char **)nargv)[cstart] = swap;
-		}
-	}
+    nnonopts = panonopt_end - panonopt_start;
+    nopts = opt_end - panonopt_end;
+    ncycle = gcd(nnonopts, nopts);
+    cyclelen = (opt_end - panonopt_start) / ncycle;
+
+    for (i = 0; i < ncycle; i++)
+    {
+        cstart = panonopt_end + i;
+        pos = cstart;
+        for (j = 0; j < cyclelen; j++)
+        {
+            if (pos >= panonopt_end)
+                pos -= nnonopts;
+            else
+                pos += nopts;
+            swap = nargv[pos];
+            /* LINTED const cast */
+            ((char**)nargv)[pos] = nargv[cstart];
+            /* LINTED const cast */
+            ((char**)nargv)[cstart] = swap;
+        }
+    }
 }
 
 #ifdef REPLACE_GETOPT
@@ -216,11 +219,9 @@ permute_args(int panonopt_start, int panonopt_end, int opt_end,
  *
  * [eventually this will replace the BSD getopt]
  */
-int
-getopt(int nargc, char * const *nargv, const char *options)
+int getopt(int nargc, char* const* nargv, const char* options)
 {
-
-	/*
+    /*
 	 * We don't pass FLAG_PERMUTE to getopt_internal() since
 	 * the BSD getopt(3) (unlike GNU) has never done this.
 	 *
@@ -228,7 +229,7 @@ getopt(int nargc, char * const *nargv, const char *options)
 	 * before dropping privileges it makes sense to keep things
 	 * as simple (and bug-free) as possible.
 	 */
-	return (getopt_internal(nargc, nargv, options, NULL, NULL, 0));
+    return (getopt_internal(nargc, nargv, options, NULL, NULL, 0));
 }
 #endif /* REPLACE_GETOPT */
 
@@ -241,7 +242,7 @@ getopt(int nargc, char * const *nargv, const char *options)
  * proclaim their BSD heritage, before including this header; however,
  * to maintain portability, developers are advised to avoid it.
  */
-# define optreset  __mingw_optreset
+#define optreset __mingw_optreset
 extern int optreset;
 #endif
 #ifdef __cplusplus
@@ -265,19 +266,19 @@ extern int optreset;
 extern "C" {
 #endif
 
-struct option		/* specification for a long form option...	*/
+struct option /* specification for a long form option...	*/
 {
-  const char *name;		/* option name, without leading hyphens */
-  int         has_arg;		/* does it take an argument?		*/
-  int        *flag;		/* where to save its status, or NULL	*/
-  int         val;		/* its associated status value		*/
+    const char* name; /* option name, without leading hyphens */
+    int has_arg;      /* does it take an argument?		*/
+    int* flag;        /* where to save its status, or NULL	*/
+    int val;          /* its associated status value		*/
 };
 
-enum    		/* permitted values for its `has_arg' field...	*/
+enum /* permitted values for its `has_arg' field...	*/
 {
-  no_argument = 0,      	/* option never takes an argument	*/
-  required_argument,		/* option always requires an argument	*/
-  optional_argument		/* option may take an argument		*/
+    no_argument = 0,   /* option never takes an argument	*/
+    required_argument, /* option always requires an argument	*/
+    optional_argument  /* option may take an argument		*/
 };
 
 /*
@@ -286,126 +287,137 @@ enum    		/* permitted values for its `has_arg' field...	*/
  * Returns -1 if short_too is set and the option does not match long_options.
  */
 static int
-parse_long_options(char * const *nargv, const char *options,
-	const struct option *long_options, int *idx, int short_too)
+parse_long_options(char* const* nargv, const char* options,
+                   const struct option* long_options, int* idx, int short_too)
 {
-	char *current_argv, *has_equal;
-	size_t current_argv_len;
-	int i, ambiguous, match;
-
-#define IDENTICAL_INTERPRETATION(_x, _y)                                \
-	(long_options[(_x)].has_arg == long_options[(_y)].has_arg &&    \
-	 long_options[(_x)].flag == long_options[(_y)].flag &&          \
-	 long_options[(_x)].val == long_options[(_y)].val)
-
-	current_argv = place;
-	match = -1;
-	ambiguous = 0;
-
-	optind++;
-
-	if ((has_equal = strchr(current_argv, '=')) != NULL) {
-		/* argument found (--option=arg) */
-		current_argv_len = has_equal - current_argv;
-		has_equal++;
-	} else
-		current_argv_len = strlen(current_argv);
-
-	for (i = 0; long_options[i].name; i++) {
-		/* find matching long option */
-		if (strncmp(current_argv, long_options[i].name,
-		    current_argv_len))
-			continue;
-
-		if (strlen(long_options[i].name) == current_argv_len) {
-			/* exact match */
-			match = i;
-			ambiguous = 0;
-			break;
-		}
-		/*
+    char *current_argv, *has_equal;
+    size_t current_argv_len;
+    int i, ambiguous, match;
+
+#define IDENTICAL_INTERPRETATION(_x, _y) \
+    (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag && long_options[(_x)].val == long_options[(_y)].val)
+
+    current_argv = place;
+    match = -1;
+    ambiguous = 0;
+
+    optind++;
+
+    if ((has_equal = strchr(current_argv, '=')) != NULL)
+    {
+        /* argument found (--option=arg) */
+        current_argv_len = has_equal - current_argv;
+        has_equal++;
+    }
+    else
+        current_argv_len = strlen(current_argv);
+
+    for (i = 0; long_options[i].name; i++)
+    {
+        /* find matching long option */
+        if (strncmp(current_argv, long_options[i].name,
+                    current_argv_len))
+            continue;
+
+        if (strlen(long_options[i].name) == current_argv_len)
+        {
+            /* exact match */
+            match = i;
+            ambiguous = 0;
+            break;
+        }
+        /*
 		 * If this is a known short option, don't allow
 		 * a partial match of a single character.
 		 */
-		if (short_too && current_argv_len == 1)
-			continue;
-
-		if (match == -1)	/* partial match */
-			match = i;
-		else if (!IDENTICAL_INTERPRETATION(i, match))
-			ambiguous = 1;
-	}
-	if (ambiguous) {
-		/* ambiguous abbreviation */
-		if (PRINT_ERROR)
-			warnx(ambig, (int)current_argv_len,
-			     current_argv);
-		optopt = 0;
-		return (BADCH);
-	}
-	if (match != -1) {		/* option found */
-		if (long_options[match].has_arg == no_argument
-		    && has_equal) {
-			if (PRINT_ERROR)
-				warnx(noarg, (int)current_argv_len,
-				     current_argv);
-			/*
+        if (short_too && current_argv_len == 1)
+            continue;
+
+        if (match == -1) /* partial match */
+            match = i;
+        else if (!IDENTICAL_INTERPRETATION(i, match))
+            ambiguous = 1;
+    }
+    if (ambiguous)
+    {
+        /* ambiguous abbreviation */
+        if (PRINT_ERROR)
+            warnx(ambig, (int)current_argv_len,
+                  current_argv);
+        optopt = 0;
+        return (BADCH);
+    }
+    if (match != -1)
+    { /* option found */
+        if (long_options[match].has_arg == no_argument
+            && has_equal)
+        {
+            if (PRINT_ERROR)
+                warnx(noarg, (int)current_argv_len,
+                      current_argv);
+            /*
 			 * XXX: GNU sets optopt to val regardless of flag
 			 */
-			if (long_options[match].flag == NULL)
-				optopt = long_options[match].val;
-			else
-				optopt = 0;
-			return (BADARG);
-		}
-		if (long_options[match].has_arg == required_argument ||
-		    long_options[match].has_arg == optional_argument) {
-			if (has_equal)
-				optarg = has_equal;
-			else if (long_options[match].has_arg ==
-			    required_argument) {
-				/*
+            if (long_options[match].flag == NULL)
+                optopt = long_options[match].val;
+            else
+                optopt = 0;
+            return (BADARG);
+        }
+        if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument)
+        {
+            if (has_equal)
+                optarg = has_equal;
+            else if (long_options[match].has_arg == required_argument)
+            {
+                /*
 				 * optional argument doesn't use next nargv
 				 */
-				optarg = nargv[optind++];
-			}
-		}
-		if ((long_options[match].has_arg == required_argument)
-		    && (optarg == NULL)) {
-			/*
+                optarg = nargv[optind++];
+            }
+        }
+        if ((long_options[match].has_arg == required_argument)
+            && (optarg == NULL))
+        {
+            /*
 			 * Missing argument; leading ':' indicates no error
 			 * should be generated.
 			 */
-			if (PRINT_ERROR)
-				warnx(recargstring,
-				    current_argv);
-			/*
+            if (PRINT_ERROR)
+                warnx(recargstring,
+                      current_argv);
+            /*
 			 * XXX: GNU sets optopt to val regardless of flag
 			 */
-			if (long_options[match].flag == NULL)
-				optopt = long_options[match].val;
-			else
-				optopt = 0;
-			--optind;
-			return (BADARG);
-		}
-	} else {			/* unknown option */
-		if (short_too) {
-			--optind;
-			return (-1);
-		}
-		if (PRINT_ERROR)
-			warnx(illoptstring, current_argv);
-		optopt = 0;
-		return (BADCH);
-	}
-	if (idx)
-		*idx = match;
-	if (long_options[match].flag) {
-		*long_options[match].flag = long_options[match].val;
-		return (0);
-	} else
-		return (long_options[match].val);
+            if (long_options[match].flag == NULL)
+                optopt = long_options[match].val;
+            else
+                optopt = 0;
+            --optind;
+            return (BADARG);
+        }
+    }
+    else
+    { /* unknown option */
+        if (short_too)
+        {
+            --optind;
+            return (-1);
+        }
+        if (PRINT_ERROR)
+            warnx(illoptstring, current_argv);
+        optopt = 0;
+        return (BADCH);
+    }
+    if (idx)
+        *idx = match;
+    if (long_options[match].flag)
+    {
+        *long_options[match].flag = long_options[match].val;
+        return (0);
+    }
+    else
+        return (long_options[match].val);
 #undef IDENTICAL_INTERPRETATION
 }
 
@@ -414,222 +426,235 @@ parse_long_options(char * const *nargv, const char *options,
  *	Parse argc/argv argument vector.  Called by user level routines.
  */
 static int
-getopt_internal(int nargc, char * const *nargv, const char *options,
-	const struct option *long_options, int *idx, int flags)
+getopt_internal(int nargc, char* const* nargv, const char* options,
+                const struct option* long_options, int* idx, int flags)
 {
-	char *oli;				/* option letter list index */
-	int optchar, short_too;
-	static int posixly_correct = -1;
+    char* oli; /* option letter list index */
+    int optchar, short_too;
+    static int posixly_correct = -1;
 
-	if (options == NULL)
-		return (-1);
+    if (options == NULL)
+        return (-1);
 
-	/*
+    /*
 	 * XXX Some GNU programs (like cvs) set optind to 0 instead of
 	 * XXX using optreset.  Work around this braindamage.
 	 */
-	if (optind == 0)
-		optind = optreset = 1;
+    if (optind == 0)
+        optind = optreset = 1;
 
-	/*
+    /*
 	 * Disable GNU extensions if POSIXLY_CORRECT is set or options
 	 * string begins with a '+'.
 	 *
 	 * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or
 	 *                 optreset != 0 for GNU compatibility.
 	 */
-	if (posixly_correct == -1 || optreset != 0)
-		posixly_correct = (getenv("POSIXLY_CORRECT") != NULL);
-	if (*options == '-')
-		flags |= FLAG_ALLARGS;
-	else if (posixly_correct || *options == '+')
-		flags &= ~FLAG_PERMUTE;
-	if (*options == '+' || *options == '-')
-		options++;
-
-	optarg = NULL;
-	if (optreset)
-		nonopt_start = nonopt_end = -1;
+    if (posixly_correct == -1 || optreset != 0)
+        posixly_correct = (getenv("POSIXLY_CORRECT") != NULL);
+    if (*options == '-')
+        flags |= FLAG_ALLARGS;
+    else if (posixly_correct || *options == '+')
+        flags &= ~FLAG_PERMUTE;
+    if (*options == '+' || *options == '-')
+        options++;
+
+    optarg = NULL;
+    if (optreset)
+        nonopt_start = nonopt_end = -1;
 start:
-	if (optreset || !*place) {		/* update scanning pointer */
-		optreset = 0;
-		if (optind >= nargc) {          /* end of argument vector */
-			place = EMSG;
-			if (nonopt_end != -1) {
-				/* do permutation, if we have to */
-				permute_args(nonopt_start, nonopt_end,
-				    optind, nargv);
-				optind -= nonopt_end - nonopt_start;
-			}
-			else if (nonopt_start != -1) {
-				/*
+    if (optreset || !*place)
+    { /* update scanning pointer */
+        optreset = 0;
+        if (optind >= nargc)
+        { /* end of argument vector */
+            place = EMSG;
+            if (nonopt_end != -1)
+            {
+                /* do permutation, if we have to */
+                permute_args(nonopt_start, nonopt_end,
+                             optind, nargv);
+                optind -= nonopt_end - nonopt_start;
+            }
+            else if (nonopt_start != -1)
+            {
+                /*
 				 * If we skipped non-options, set optind
 				 * to the first of them.
 				 */
-				optind = nonopt_start;
-			}
-			nonopt_start = nonopt_end = -1;
-			return (-1);
-		}
-		if (*(place = nargv[optind]) != '-' ||
-		    (place[1] == '\0' && strchr(options, '-') == NULL)) {
-			place = EMSG;		/* found non-option */
-			if (flags & FLAG_ALLARGS) {
-				/*
+                optind = nonopt_start;
+            }
+            nonopt_start = nonopt_end = -1;
+            return (-1);
+        }
+        if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL))
+        {
+            place = EMSG; /* found non-option */
+            if (flags & FLAG_ALLARGS)
+            {
+                /*
 				 * GNU extension:
 				 * return non-option as argument to option 1
 				 */
-				optarg = nargv[optind++];
-				return (INORDER);
-			}
-			if (!(flags & FLAG_PERMUTE)) {
-				/*
+                optarg = nargv[optind++];
+                return (INORDER);
+            }
+            if (!(flags & FLAG_PERMUTE))
+            {
+                /*
 				 * If no permutation wanted, stop parsing
 				 * at first non-option.
 				 */
-				return (-1);
-			}
-			/* do permutation */
-			if (nonopt_start == -1)
-				nonopt_start = optind;
-			else if (nonopt_end != -1) {
-				permute_args(nonopt_start, nonopt_end,
-				    optind, nargv);
-				nonopt_start = optind -
-				    (nonopt_end - nonopt_start);
-				nonopt_end = -1;
-			}
-			optind++;
-			/* process next argument */
-			goto start;
-		}
-		if (nonopt_start != -1 && nonopt_end == -1)
-			nonopt_end = optind;
-
-		/*
+                return (-1);
+            }
+            /* do permutation */
+            if (nonopt_start == -1)
+                nonopt_start = optind;
+            else if (nonopt_end != -1)
+            {
+                permute_args(nonopt_start, nonopt_end,
+                             optind, nargv);
+                nonopt_start = optind - (nonopt_end - nonopt_start);
+                nonopt_end = -1;
+            }
+            optind++;
+            /* process next argument */
+            goto start;
+        }
+        if (nonopt_start != -1 && nonopt_end == -1)
+            nonopt_end = optind;
+
+        /*
 		 * If we have "-" do nothing, if "--" we are done.
 		 */
-		if (place[1] != '\0' && *++place == '-' && place[1] == '\0') {
-			optind++;
-			place = EMSG;
-			/*
+        if (place[1] != '\0' && *++place == '-' && place[1] == '\0')
+        {
+            optind++;
+            place = EMSG;
+            /*
 			 * We found an option (--), so if we skipped
 			 * non-options, we have to permute.
 			 */
-			if (nonopt_end != -1) {
-				permute_args(nonopt_start, nonopt_end,
-				    optind, nargv);
-				optind -= nonopt_end - nonopt_start;
-			}
-			nonopt_start = nonopt_end = -1;
-			return (-1);
-		}
-	}
-
-	/*
+            if (nonopt_end != -1)
+            {
+                permute_args(nonopt_start, nonopt_end,
+                             optind, nargv);
+                optind -= nonopt_end - nonopt_start;
+            }
+            nonopt_start = nonopt_end = -1;
+            return (-1);
+        }
+    }
+
+    /*
 	 * Check long options if:
 	 *  1) we were passed some
 	 *  2) the arg is not just "-"
 	 *  3) either the arg starts with -- we are getopt_long_only()
 	 */
-	if (long_options != NULL && place != nargv[optind] &&
-	    (*place == '-' || (flags & FLAG_LONGONLY))) {
-		short_too = 0;
-		if (*place == '-')
-			place++;		/* --foo long option */
-		else if (*place != ':' && strchr(options, *place) != NULL)
-			short_too = 1;		/* could be short option too */
-
-		optchar = parse_long_options(nargv, options, long_options,
-		    idx, short_too);
-		if (optchar != -1) {
-			place = EMSG;
-			return (optchar);
-		}
-	}
-
-	if ((optchar = (int)*place++) == (int)':' ||
-	    (optchar == (int)'-' && *place != '\0') ||
-	    (oli = (char*)strchr(options, optchar)) == NULL) {
-		/*
+    if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY)))
+    {
+        short_too = 0;
+        if (*place == '-')
+            place++; /* --foo long option */
+        else if (*place != ':' && strchr(options, *place) != NULL)
+            short_too = 1; /* could be short option too */
+
+        optchar = parse_long_options(nargv, options, long_options,
+                                     idx, short_too);
+        if (optchar != -1)
+        {
+            place = EMSG;
+            return (optchar);
+        }
+    }
+
+    if ((optchar = (int)*place++) == (int)':' || (optchar == (int)'-' && *place != '\0') || (oli = (char*)strchr(options, optchar)) == NULL)
+    {
+        /*
 		 * If the user specified "-" and  '-' isn't listed in
 		 * options, return -1 (non-option) as per POSIX.
 		 * Otherwise, it is an unknown option character (or ':').
 		 */
-		if (optchar == (int)'-' && *place == '\0')
-			return (-1);
-		if (!*place)
-			++optind;
-		if (PRINT_ERROR)
-			warnx(illoptchar, optchar);
-		optopt = optchar;
-		return (BADCH);
-	}
-	if (long_options != NULL && optchar == 'W' && oli[1] == ';') {
-		/* -W long-option */
-		if (*place)			/* no space */
-			/* NOTHING */;
-		else if (++optind >= nargc) {	/* no arg */
-			place = EMSG;
-			if (PRINT_ERROR)
-				warnx(recargchar, optchar);
-			optopt = optchar;
-			return (BADARG);
-		} else				/* white space */
-			place = nargv[optind];
-		optchar = parse_long_options(nargv, options, long_options,
-		    idx, 0);
-		place = EMSG;
-		return (optchar);
-	}
-	if (*++oli != ':') {			/* doesn't take argument */
-		if (!*place)
-			++optind;
-	} else {				/* takes (optional) argument */
-		optarg = NULL;
-		if (*place)			/* no white space */
-			optarg = place;
-		else if (oli[1] != ':') {	/* arg not optional */
-			if (++optind >= nargc) {	/* no arg */
-				place = EMSG;
-				if (PRINT_ERROR)
-					warnx(recargchar, optchar);
-				optopt = optchar;
-				return (BADARG);
-			} else
-				optarg = nargv[optind];
-		}
-		place = EMSG;
-		++optind;
-	}
-	/* dump back option letter */
-	return (optchar);
+        if (optchar == (int)'-' && *place == '\0')
+            return (-1);
+        if (!*place)
+            ++optind;
+        if (PRINT_ERROR)
+            warnx(illoptchar, optchar);
+        optopt = optchar;
+        return (BADCH);
+    }
+    if (long_options != NULL && optchar == 'W' && oli[1] == ';')
+    {
+        /* -W long-option */
+        if (*place) /* no space */
+            /* NOTHING */;
+        else if (++optind >= nargc)
+        { /* no arg */
+            place = EMSG;
+            if (PRINT_ERROR)
+                warnx(recargchar, optchar);
+            optopt = optchar;
+            return (BADARG);
+        }
+        else /* white space */
+            place = nargv[optind];
+        optchar = parse_long_options(nargv, options, long_options,
+                                     idx, 0);
+        place = EMSG;
+        return (optchar);
+    }
+    if (*++oli != ':')
+    { /* doesn't take argument */
+        if (!*place)
+            ++optind;
+    }
+    else
+    { /* takes (optional) argument */
+        optarg = NULL;
+        if (*place) /* no white space */
+            optarg = place;
+        else if (oli[1] != ':')
+        { /* arg not optional */
+            if (++optind >= nargc)
+            { /* no arg */
+                place = EMSG;
+                if (PRINT_ERROR)
+                    warnx(recargchar, optchar);
+                optopt = optchar;
+                return (BADARG);
+            }
+            else
+                optarg = nargv[optind];
+        }
+        place = EMSG;
+        ++optind;
+    }
+    /* dump back option letter */
+    return (optchar);
 }
 
 /*
  * getopt_long --
  *	Parse argc/argv argument vector.
  */
-int
-getopt_long(int nargc, char * const *nargv, const char *options,
-    const struct option *long_options, int *idx)
+int getopt_long(int nargc, char* const* nargv, const char* options,
+                const struct option* long_options, int* idx)
 {
-
-	return (getopt_internal(nargc, nargv, options, long_options, idx,
-	    FLAG_PERMUTE));
+    return (getopt_internal(nargc, nargv, options, long_options, idx,
+                            FLAG_PERMUTE));
 }
 
 /*
  * getopt_long_only --
  *	Parse argc/argv argument vector.
  */
-int
-getopt_long_only(int nargc, char * const *nargv, const char *options,
-    const struct option *long_options, int *idx)
+int getopt_long_only(int nargc, char* const* nargv, const char* options,
+                     const struct option* long_options, int* idx)
 {
-
-	return (getopt_internal(nargc, nargv, options, long_options, idx,
-	    FLAG_PERMUTE|FLAG_LONGONLY));
+    return (getopt_internal(nargc, nargv, options, long_options, idx,
+                            FLAG_PERMUTE | FLAG_LONGONLY));
 }
 
 //extern int getopt_long(int nargc, char * const *nargv, const char *options,
@@ -643,7 +668,7 @@ getopt_long_only(int nargc, char * const *nargv, const char *options,
 /*
  * ...for the long form API only; keep this for compatibility.
  */
-# define HAVE_DECL_GETOPT	1
+#define HAVE_DECL_GETOPT 1
 #endif
 
 #ifdef __cplusplus
diff --git a/examples/common/stb_image.h b/examples/common/stb_image.h
index aa445aadf..142610cf4 100644
--- a/examples/common/stb_image.h
+++ b/examples/common/stb_image.h
@@ -3,13 +3,13 @@
 
 #ifndef STBI_NO_STDIO
 #include <stdio.h>
-#endif    // STBI_NO_STDIO
+#endif // STBI_NO_STDIO
 
 #define STBI_VERSION 1
 
 enum
 {
-    STBI_default = 0,    // only used for desired_channels
+    STBI_default = 0, // only used for desired_channels
 
     STBI_grey = 1,
     STBI_grey_alpha = 2,
@@ -36,9 +36,9 @@ extern "C" {
 typedef struct
 {
     int (*read)(void* user, char* data,
-                int size);    // fill 'data' with 'size' bytes.  return number of bytes actually read
-    void (*skip)(void* user, int n);    // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
-    int (*eof)(void* user);    // returns nonzero if we are at end of file/data
+                int size);           // fill 'data' with 'size' bytes.  return number of bytes actually read
+    void (*skip)(void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+    int (*eof)(void* user);          // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
 ////////////////////////////////////
@@ -95,12 +95,12 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* channels_in_fil
 #ifndef STBI_NO_HDR
 extern void stbi_hdr_to_ldr_gamma(float gamma);
 extern void stbi_hdr_to_ldr_scale(float scale);
-#endif    // STBI_NO_HDR
+#endif // STBI_NO_HDR
 
 #ifndef STBI_NO_LINEAR
 extern void stbi_ldr_to_hdr_gamma(float gamma);
 extern void stbi_ldr_to_hdr_scale(float scale);
-#endif    // STBI_NO_LINEAR
+#endif // STBI_NO_LINEAR
 
 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
 extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user);
@@ -108,7 +108,7 @@ extern int stbi_is_hdr_from_memory(stbi_uc const* buffer, int len);
 #ifndef STBI_NO_STDIO
 extern int stbi_is_hdr(char const* filename);
 extern int stbi_is_hdr_from_file(FILE* f);
-#endif    // STBI_NO_STDIO
+#endif // STBI_NO_STDIO
 
 // get a VERY brief reason for failure
 // NOT THREADSAFE
@@ -160,14 +160,12 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char*
 //
 //
 ////   end header file   /////////////////////////////////////////////////////
-#endif    // STBI_INCLUDE_STB_IMAGE_H
+#endif // STBI_INCLUDE_STB_IMAGE_H
 
 #define STB_IMAGE_IMPLEMENTATION
 #ifdef STB_IMAGE_IMPLEMENTATION
 
-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || \
-    defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) ||  \
-    defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
 #ifndef STBI_ONLY_JPEG
 #define STBI_NO_JPEG
 #endif
@@ -202,13 +200,13 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char*
 #endif
 
 #include <stdarg.h>
-#include <stddef.h>    // ptrdiff_t on osx
+#include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>    // ldexp, pow
+#include <math.h> // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -247,9 +245,9 @@ typedef int32_t stbi__int32;
 typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 
 #ifdef _MSC_VER
-#define STBI_NOTUSED(v) ( void )(v)
+#define STBI_NOTUSED(v) (void)(v)
 #else
-#define STBI_NOTUSED(v) ( void )sizeof(v)
+#define STBI_NOTUSED(v) (void)sizeof(v)
 #endif
 
 #ifdef _MSC_VER
@@ -271,9 +269,9 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 #endif
 
 #ifndef STBI_MALLOC
-#define STBI_MALLOC(sz) malloc(sz)
+#define STBI_MALLOC(sz)        malloc(sz)
 #define STBI_REALLOC(p, newsz) realloc(p, newsz)
-#define STBI_FREE(p) free(p)
+#define STBI_FREE(p)           free(p)
 #endif
 
 #ifndef STBI_REALLOC_SIZED
@@ -319,8 +317,8 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 
 #ifdef _MSC_VER
 
-#if _MSC_VER >= 1400    // not VC6
-#include <intrin.h>    // __cpuid
+#if _MSC_VER >= 1400 // not VC6
+#include <intrin.h>  // __cpuid
 static int stbi__cpuid3(void)
 {
     int info[4];
@@ -347,7 +345,7 @@ static int stbi__sse2_available(void)
     int info3 = stbi__cpuid3();
     return ((info3 >> 26) & 1) != 0;
 }
-#else    // assume GCC-style if not VC++
+#else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
 static int stbi__sse2_available(void)
@@ -404,8 +402,8 @@ static void stbi__start_mem(stbi__context* s, stbi_uc const* buffer, int len)
 {
     s->io.read = NULL;
     s->read_from_callbacks = 0;
-    s->img_buffer = s->img_buffer_original = ( stbi_uc* )buffer;
-    s->img_buffer_end = s->img_buffer_original_end = ( stbi_uc* )buffer + len;
+    s->img_buffer = s->img_buffer_original = (stbi_uc*)buffer;
+    s->img_buffer_end = s->img_buffer_original_end = (stbi_uc*)buffer + len;
 }
 
 // initialize a callback-based context
@@ -424,17 +422,17 @@ static void stbi__start_callbacks(stbi__context* s, stbi_io_callbacks* c, void*
 
 static int stbi__stdio_read(void* user, char* data, int size)
 {
-    return ( int )fread(data, 1, size, ( FILE* )user);
+    return (int)fread(data, 1, size, (FILE*)user);
 }
 
 static void stbi__stdio_skip(void* user, int n)
 {
-    fseek(( FILE* )user, n, SEEK_CUR);
+    fseek((FILE*)user, n, SEEK_CUR);
 }
 
 static int stbi__stdio_eof(void* user)
 {
-    return feof(( FILE* )user);
+    return feof((FILE*)user);
 }
 
 static stbi_io_callbacks stbi__stdio_callbacks = {
@@ -445,12 +443,12 @@ static stbi_io_callbacks stbi__stdio_callbacks = {
 
 static void stbi__start_file(stbi__context* s, FILE* f)
 {
-    stbi__start_callbacks(s, &stbi__stdio_callbacks, ( void* )f);
+    stbi__start_callbacks(s, &stbi__stdio_callbacks, (void*)f);
 }
 
 // static void stop_file(stbi__context *s) { }
 
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
 static void stbi__rewind(stbi__context* s)
 {
@@ -564,7 +562,7 @@ static void* stbi__malloc(size_t size)
 // negative terms are considered invalid.
 static int stbi__addsizes_valid(int a, int b)
 {
-    if(b < 0)
+    if (b < 0)
         return 0;
     // now 0 <= b <= INT_MAX, hence also
     // 0 <= INT_MAX - b <= INTMAX.
@@ -577,10 +575,10 @@ static int stbi__addsizes_valid(int a, int b)
 // negative factors are considered invalid.
 static int stbi__mul2sizes_valid(int a, int b)
 {
-    if(a < 0 || b < 0)
+    if (a < 0 || b < 0)
         return 0;
-    if(b == 0)
-        return 1;    // mul-by-0 is always safe
+    if (b == 0)
+        return 1; // mul-by-0 is always safe
     // portable way to check for no overflows in a*b
     return a <= INT_MAX / b;
 }
@@ -601,22 +599,21 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
 static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 {
-    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) &&
-           stbi__addsizes_valid(a * b * c * d, add);
+    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && stbi__addsizes_valid(a * b * c * d, add);
 }
 #endif
 
 // mallocs with size overflow checking
 static void* stbi__malloc_mad2(int a, int b, int add)
 {
-    if(!stbi__mad2sizes_valid(a, b, add))
+    if (!stbi__mad2sizes_valid(a, b, add))
         return NULL;
     return stbi__malloc(a * b + add);
 }
 
 static void* stbi__malloc_mad3(int a, int b, int c, int add)
 {
-    if(!stbi__mad3sizes_valid(a, b, c, add))
+    if (!stbi__mad3sizes_valid(a, b, c, add))
         return NULL;
     return stbi__malloc(a * b * c + add);
 }
@@ -624,7 +621,7 @@ static void* stbi__malloc_mad3(int a, int b, int c, int add)
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
 static void* stbi__malloc_mad4(int a, int b, int c, int d, int add)
 {
-    if(!stbi__mad4sizes_valid(a, b, c, d, add))
+    if (!stbi__mad4sizes_valid(a, b, c, d, add))
         return NULL;
     return stbi__malloc(a * b * c * d + add);
 }
@@ -642,8 +639,8 @@ static void* stbi__malloc_mad4(int a, int b, int c, int d, int add)
 #define stbi__err(x, y) stbi__err(x)
 #endif
 
-#define stbi__errpf(x, y) (( float* )(size_t)(stbi__err(x, y) ? NULL : NULL))
-#define stbi__errpuc(x, y) (( unsigned char* )(size_t)(stbi__err(x, y) ? NULL : NULL))
+#define stbi__errpf(x, y)  ((float*)(size_t)(stbi__err(x, y) ? NULL : NULL))
+#define stbi__errpuc(x, y) ((unsigned char*)(size_t)(stbi__err(x, y) ? NULL : NULL))
 
 extern void stbi_image_free(void* retval_from_stbi_load)
 {
@@ -667,43 +664,42 @@ extern void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
 
 static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri, int bpc)
 {
-    memset(ri, 0, sizeof(*ri));    // make sure it's initialized if we add new fields
-    ri->bits_per_channel = 8;    // default is 8 so most paths don't have to be changed
-    ri->channel_order =
-        STBI_ORDER_RGB;    // all current input & output are this, but this is here so we can add BGR order
+    memset(ri, 0, sizeof(*ri));         // make sure it's initialized if we add new fields
+    ri->bits_per_channel = 8;           // default is 8 so most paths don't have to be changed
+    ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
     ri->num_channels = 0;
 
 #ifndef STBI_NO_JPEG
-    if(stbi__jpeg_test(s))
+    if (stbi__jpeg_test(s))
         return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_PNG
-    if(stbi__png_test(s))
+    if (stbi__png_test(s))
         return stbi__png_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_BMP
-    if(stbi__bmp_test(s))
+    if (stbi__bmp_test(s))
         return stbi__bmp_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_GIF
-    if(stbi__gif_test(s))
+    if (stbi__gif_test(s))
         return stbi__gif_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_PSD
-    if(stbi__psd_test(s))
+    if (stbi__psd_test(s))
         return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc);
 #endif
 #ifndef STBI_NO_PIC
-    if(stbi__pic_test(s))
+    if (stbi__pic_test(s))
         return stbi__pic_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_PNM
-    if(stbi__pnm_test(s))
+    if (stbi__pnm_test(s))
         return stbi__pnm_load(s, x, y, comp, req_comp, ri);
 #endif
 
 #ifndef STBI_NO_HDR
-    if(stbi__hdr_test(s))
+    if (stbi__hdr_test(s))
     {
         float* hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri);
         return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
@@ -712,7 +708,7 @@ static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int re
 
 #ifndef STBI_NO_TGA
     // test tga last because it's a crappy test!
-    if(stbi__tga_test(s))
+    if (stbi__tga_test(s))
         return stbi__tga_load(s, x, y, comp, req_comp, ri);
 #endif
 
@@ -725,13 +721,12 @@ static stbi_uc* stbi__convert_16_to_8(stbi__uint16* orig, int w, int h, int chan
     int img_len = w * h * channels;
     stbi_uc* reduced;
 
-    reduced = ( stbi_uc* )stbi__malloc(img_len);
-    if(reduced == NULL)
+    reduced = (stbi_uc*)stbi__malloc(img_len);
+    if (reduced == NULL)
         return stbi__errpuc("outofmem", "Out of memory");
 
-    for(i = 0; i < img_len; ++i)
-        reduced[i] =
-            (stbi_uc)((orig[i] >> 8) & 0xFF);    // top half of each byte is sufficient approx of 16->8 bit scaling
+    for (i = 0; i < img_len; ++i)
+        reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
 
     STBI_FREE(orig);
     return reduced;
@@ -743,13 +738,12 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan
     int img_len = w * h * channels;
     stbi__uint16* enlarged;
 
-    enlarged = ( stbi__uint16* )stbi__malloc(img_len * 2);
-    if(enlarged == NULL)
-        return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory");
+    enlarged = (stbi__uint16*)stbi__malloc(img_len * 2);
+    if (enlarged == NULL)
+        return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory");
 
-    for(i = 0; i < img_len; ++i)
-        enlarged[i] =
-            (stbi__uint16)((orig[i] << 8) + orig[i]);    // replicate to high and low byte, maps 0->0, 255->0xffff
+    for (i = 0; i < img_len; ++i)
+        enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
 
     STBI_FREE(orig);
     return enlarged;
@@ -758,17 +752,17 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan
 static void stbi__vertical_flip(void* image, int w, int h, int bytes_per_pixel)
 {
     int row;
-    size_t bytes_per_row = ( size_t )w * bytes_per_pixel;
+    size_t bytes_per_row = (size_t)w * bytes_per_pixel;
     stbi_uc temp[2048];
-    stbi_uc* bytes = ( stbi_uc* )image;
+    stbi_uc* bytes = (stbi_uc*)image;
 
-    for(row = 0; row < (h >> 1); row++)
+    for (row = 0; row < (h >> 1); row++)
     {
         stbi_uc* row0 = bytes + row * bytes_per_row;
         stbi_uc* row1 = bytes + (h - row - 1) * bytes_per_row;
         // swap row0 with row1
         size_t bytes_left = bytes_per_row;
-        while(bytes_left)
+        while (bytes_left)
         {
             size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
             memcpy(temp, row0, bytes_copy);
@@ -786,8 +780,8 @@ static void stbi__vertical_flip_slices(void* image, int w, int h, int z, int byt
     int slice;
     int slice_size = w * h * bytes_per_pixel;
 
-    stbi_uc* bytes = ( stbi_uc* )image;
-    for(slice = 0; slice < z; ++slice)
+    stbi_uc* bytes = (stbi_uc*)image;
+    for (slice = 0; slice < z; ++slice)
     {
         stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
         bytes += slice_size;
@@ -799,25 +793,25 @@ static unsigned char* stbi__load_and_postprocess_8bit(stbi__context* s, int* x,
     stbi__result_info ri;
     void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
 
-    if(result == NULL)
+    if (result == NULL)
         return NULL;
 
-    if(ri.bits_per_channel != 8)
+    if (ri.bits_per_channel != 8)
     {
         STBI_ASSERT(ri.bits_per_channel == 16);
-        result = stbi__convert_16_to_8(( stbi__uint16* )result, *x, *y, req_comp == 0 ? *comp : req_comp);
+        result = stbi__convert_16_to_8((stbi__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp);
         ri.bits_per_channel = 8;
     }
 
     // @TODO: move stbi__convert_format to here
 
-    if(stbi__vertically_flip_on_load)
+    if (stbi__vertically_flip_on_load)
     {
         int channels = req_comp ? req_comp : *comp;
         stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
     }
 
-    return ( unsigned char* )result;
+    return (unsigned char*)result;
 }
 
 static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x, int* y, int* comp, int req_comp)
@@ -825,32 +819,32 @@ static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x,
     stbi__result_info ri;
     void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
 
-    if(result == NULL)
+    if (result == NULL)
         return NULL;
 
-    if(ri.bits_per_channel != 16)
+    if (ri.bits_per_channel != 16)
     {
         STBI_ASSERT(ri.bits_per_channel == 8);
-        result = stbi__convert_8_to_16(( stbi_uc* )result, *x, *y, req_comp == 0 ? *comp : req_comp);
+        result = stbi__convert_8_to_16((stbi_uc*)result, *x, *y, req_comp == 0 ? *comp : req_comp);
         ri.bits_per_channel = 16;
     }
 
     // @TODO: move stbi__convert_format16 to here
     // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
 
-    if(stbi__vertically_flip_on_load)
+    if (stbi__vertically_flip_on_load)
     {
         int channels = req_comp ? req_comp : *comp;
         stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
     }
 
-    return ( stbi__uint16* )result;
+    return (stbi__uint16*)result;
 }
 
 #if !defined(STBI_NO_HDR) || !defined(STBI_NO_LINEAR)
 static void stbi__float_postprocess(float* result, int* x, int* y, int* comp, int req_comp)
 {
-    if(stbi__vertically_flip_on_load && result != NULL)
+    if (stbi__vertically_flip_on_load && result != NULL)
     {
         int channels = req_comp ? req_comp : *comp;
         stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
@@ -864,7 +858,7 @@ static FILE* stbi__fopen(char const* filename, char const* mode)
 {
     FILE* f;
 #if defined(_MSC_VER) && _MSC_VER >= 1400
-    if(0 != fopen_s(&f, filename, mode))
+    if (0 != fopen_s(&f, filename, mode))
         f = 0;
 #else
     f = fopen(filename, mode);
@@ -876,7 +870,7 @@ extern stbi_uc* stbi_load(const char* filename, int* x, int* y, int* comp, int r
 {
     FILE* f = stbi__fopen(filename, "rb");
     unsigned char* result;
-    if(!f)
+    if (!f)
         return stbi__errpuc("can't fopen", "Unable to open file");
     result = stbi_load_from_file(f, x, y, comp, req_comp);
     fclose(f);
@@ -889,10 +883,10 @@ extern stbi_uc* stbi_load_from_file(FILE* f, int* x, int* y, int* comp, int req_
     stbi__context s;
     stbi__start_file(&s, f);
     result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
-    if(result)
+    if (result)
     {
         // need to 'unget' all the characters in the IO buffer
-        fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+        fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
     }
     return result;
 }
@@ -903,10 +897,10 @@ extern stbi__uint16* stbi_load_from_file_16(FILE* f, int* x, int* y, int* comp,
     stbi__context s;
     stbi__start_file(&s, f);
     result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp);
-    if(result)
+    if (result)
     {
         // need to 'unget' all the characters in the IO buffer
-        fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+        fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
     }
     return result;
 }
@@ -915,14 +909,14 @@ extern stbi_us* stbi_load_16(char const* filename, int* x, int* y, int* comp, in
 {
     FILE* f = stbi__fopen(filename, "rb");
     stbi__uint16* result;
-    if(!f)
-        return ( stbi_us* )stbi__errpuc("can't fopen", "Unable to open file");
+    if (!f)
+        return (stbi_us*)stbi__errpuc("can't fopen", "Unable to open file");
     result = stbi_load_from_file_16(f, x, y, comp, req_comp);
     fclose(f);
     return result;
 }
 
-#endif    //! STBI_NO_STDIO
+#endif //! STBI_NO_STDIO
 
 extern stbi_us* stbi_load_16_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* channels_in_file,
                                          int desired_channels)
@@ -936,7 +930,7 @@ extern stbi_us* stbi_load_16_from_callbacks(stbi_io_callbacks const* clbk, void*
                                             int* channels_in_file, int desired_channels)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels);
 }
 
@@ -951,7 +945,7 @@ extern stbi_uc* stbi_load_from_callbacks(stbi_io_callbacks const* clbk, void* us
                                          int req_comp)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
 }
 
@@ -963,8 +957,8 @@ extern stbi_uc* stbi_load_gif_from_memory(stbi_uc const* buffer, int len, int**
     stbi__context s;
     stbi__start_mem(&s, buffer, len);
 
-    result = ( unsigned char* )stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
-    if(stbi__vertically_flip_on_load)
+    result = (unsigned char*)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+    if (stbi__vertically_flip_on_load)
     {
         stbi__vertical_flip_slices(result, *x, *y, *z, *comp);
     }
@@ -978,17 +972,17 @@ static float* stbi__loadf_main(stbi__context* s, int* x, int* y, int* comp, int
 {
     unsigned char* data;
 #ifndef STBI_NO_HDR
-    if(stbi__hdr_test(s))
+    if (stbi__hdr_test(s))
     {
         stbi__result_info ri;
         float* hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri);
-        if(hdr_data)
+        if (hdr_data)
             stbi__float_postprocess(hdr_data, x, y, comp, req_comp);
         return hdr_data;
     }
 #endif
     data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
-    if(data)
+    if (data)
         return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
     return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
 }
@@ -1004,7 +998,7 @@ extern float* stbi_loadf_from_callbacks(stbi_io_callbacks const* clbk, void* use
                                         int req_comp)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__loadf_main(&s, x, y, comp, req_comp);
 }
 
@@ -1013,7 +1007,7 @@ extern float* stbi_loadf(char const* filename, int* x, int* y, int* comp, int re
 {
     float* result;
     FILE* f = stbi__fopen(filename, "rb");
-    if(!f)
+    if (!f)
         return stbi__errpf("can't fopen", "Unable to open file");
     result = stbi_loadf_from_file(f, x, y, comp, req_comp);
     fclose(f);
@@ -1026,9 +1020,9 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* comp, int req_c
     stbi__start_file(&s, f);
     return stbi__loadf_main(&s, x, y, comp, req_comp);
 }
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
-#endif    // !STBI_NO_LINEAR
+#endif // !STBI_NO_LINEAR
 
 // these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is
 // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
@@ -1052,7 +1046,7 @@ extern int stbi_is_hdr(char const* filename)
 {
     FILE* f = stbi__fopen(filename, "rb");
     int result = 0;
-    if(f)
+    if (f)
     {
         result = stbi_is_hdr_from_file(f);
         fclose(f);
@@ -1075,13 +1069,13 @@ extern int stbi_is_hdr_from_file(FILE* f)
     return 0;
 #endif
 }
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
 extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user)
 {
 #ifndef STBI_NO_HDR
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__hdr_test(&s);
 #else
     STBI_NOTUSED(clbk);
@@ -1128,8 +1122,8 @@ enum
 
 static void stbi__refill_buffer(stbi__context* s)
 {
-    int n = (s->io.read)(s->io_user_data, ( char* )s->buffer_start, s->buflen);
-    if(n == 0)
+    int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);
+    if (n == 0)
     {
         // at end of file, treat same as if from memory, but need to handle case
         // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
@@ -1147,9 +1141,9 @@ static void stbi__refill_buffer(stbi__context* s)
 
 stbi_inline static stbi_uc stbi__get8(stbi__context* s)
 {
-    if(s->img_buffer < s->img_buffer_end)
+    if (s->img_buffer < s->img_buffer_end)
         return *s->img_buffer++;
-    if(s->read_from_callbacks)
+    if (s->read_from_callbacks)
     {
         stbi__refill_buffer(s);
         return *s->img_buffer++;
@@ -1159,13 +1153,13 @@ stbi_inline static stbi_uc stbi__get8(stbi__context* s)
 
 stbi_inline static int stbi__at_eof(stbi__context* s)
 {
-    if(s->io.read)
+    if (s->io.read)
     {
-        if(!(s->io.eof)(s->io_user_data))
+        if (!(s->io.eof)(s->io_user_data))
             return 0;
         // if feof() is true, check if buffer = end
         // special case: we've only got the special 0 character at the end
-        if(s->read_from_callbacks == 0)
+        if (s->read_from_callbacks == 0)
             return 1;
     }
 
@@ -1174,15 +1168,15 @@ stbi_inline static int stbi__at_eof(stbi__context* s)
 
 static void stbi__skip(stbi__context* s, int n)
 {
-    if(n < 0)
+    if (n < 0)
     {
         s->img_buffer = s->img_buffer_end;
         return;
     }
-    if(s->io.read)
+    if (s->io.read)
     {
-        int blen = ( int )(s->img_buffer_end - s->img_buffer);
-        if(blen < n)
+        int blen = (int)(s->img_buffer_end - s->img_buffer);
+        if (blen < n)
         {
             s->img_buffer = s->img_buffer_end;
             (s->io.skip)(s->io_user_data, n - blen);
@@ -1194,23 +1188,23 @@ static void stbi__skip(stbi__context* s, int n)
 
 static int stbi__getn(stbi__context* s, stbi_uc* buffer, int n)
 {
-    if(s->io.read)
+    if (s->io.read)
     {
-        int blen = ( int )(s->img_buffer_end - s->img_buffer);
-        if(blen < n)
+        int blen = (int)(s->img_buffer_end - s->img_buffer);
+        if (blen < n)
         {
             int res, count;
 
             memcpy(buffer, s->img_buffer, blen);
 
-            count = (s->io.read)(s->io_user_data, ( char* )buffer + blen, n - blen);
+            count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen);
             res = (count == (n - blen));
             s->img_buffer = s->img_buffer_end;
             return res;
         }
     }
 
-    if(s->img_buffer + n <= s->img_buffer_end)
+    if (s->img_buffer + n <= s->img_buffer_end)
     {
         memcpy(buffer, s->img_buffer, n);
         s->img_buffer += n;
@@ -1250,7 +1244,7 @@ static stbi__uint32 stbi__get32le(stbi__context* s)
 }
 #endif
 
-#define STBI__BYTECAST(x) ((stbi_uc)(( x )&255))    // truncate int to byte without warnings
+#define STBI__BYTECAST(x) ((stbi_uc)((x)&255)) // truncate int to byte without warnings
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -1273,29 +1267,29 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r
     int i, j;
     unsigned char* good;
 
-    if(req_comp == img_n)
+    if (req_comp == img_n)
         return data;
     STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-    good = ( unsigned char* )stbi__malloc_mad3(req_comp, x, y, 0);
-    if(good == NULL)
+    good = (unsigned char*)stbi__malloc_mad3(req_comp, x, y, 0);
+    if (good == NULL)
     {
         STBI_FREE(data);
         return stbi__errpuc("outofmem", "Out of memory");
     }
 
-    for(j = 0; j < ( int )y; ++j)
+    for (j = 0; j < (int)y; ++j)
     {
         unsigned char* src = data + j * x * img_n;
         unsigned char* dest = good + j * x * req_comp;
 
-#define STBI__COMBO(a, b) (( a )*8 + (b))
+#define STBI__COMBO(a, b) ((a)*8 + (b))
 #define STBI__CASE(a, b)    \
     case STBI__COMBO(a, b): \
-        for(i = x - 1; i >= 0; --i, src += a, dest += b)
+        for (i = x - 1; i >= 0; --i, src += a, dest += b)
         // convert source image with img_n components to one with req_comp components;
         // avoid switch per pixel, so use switch per scanline and massive macros
-        switch(STBI__COMBO(img_n, req_comp))
+        switch (STBI__COMBO(img_n, req_comp))
         {
             STBI__CASE(1, 2)
             {
@@ -1357,8 +1351,8 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r
                 dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
             }
             break;
-            default:
-                STBI_ASSERT(0);
+        default:
+            STBI_ASSERT(0);
         }
 #undef STBI__CASE
     }
@@ -1377,29 +1371,29 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r
     int i, j;
     stbi__uint16* good;
 
-    if(req_comp == img_n)
+    if (req_comp == img_n)
         return data;
     STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-    good = ( stbi__uint16* )stbi__malloc((size_t)req_comp * x * y * 2);
-    if(good == NULL)
+    good = (stbi__uint16*)stbi__malloc((size_t)req_comp * x * y * 2);
+    if (good == NULL)
     {
         STBI_FREE(data);
-        return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory");
+        return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory");
     }
 
-    for(j = 0; j < ( int )y; ++j)
+    for (j = 0; j < (int)y; ++j)
     {
         stbi__uint16* src = data + j * x * img_n;
         stbi__uint16* dest = good + j * x * req_comp;
 
-#define STBI__COMBO(a, b) (( a )*8 + (b))
+#define STBI__COMBO(a, b) ((a)*8 + (b))
 #define STBI__CASE(a, b)    \
     case STBI__COMBO(a, b): \
-        for(i = x - 1; i >= 0; --i, src += a, dest += b)
+        for (i = x - 1; i >= 0; --i, src += a, dest += b)
         // convert source image with img_n components to one with req_comp components;
         // avoid switch per pixel, so use switch per scanline and massive macros
-        switch(STBI__COMBO(img_n, req_comp))
+        switch (STBI__COMBO(img_n, req_comp))
         {
             STBI__CASE(1, 2)
             {
@@ -1461,8 +1455,8 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r
                 dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
             }
             break;
-            default:
-                STBI_ASSERT(0);
+        default:
+            STBI_ASSERT(0);
         }
 #undef STBI__CASE
     }
@@ -1476,26 +1470,26 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp)
 {
     int i, k, n;
     float* output;
-    if(!data)
+    if (!data)
         return NULL;
-    output = ( float* )stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
-    if(output == NULL)
+    output = (float*)stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
+    if (output == NULL)
     {
         STBI_FREE(data);
         return stbi__errpf("outofmem", "Out of memory");
     }
     // compute number of non-alpha components
-    if(comp & 1)
+    if (comp & 1)
         n = comp;
     else
         n = comp - 1;
-    for(i = 0; i < x * y; ++i)
+    for (i = 0; i < x * y; ++i)
     {
-        for(k = 0; k < n; ++k)
+        for (k = 0; k < n; ++k)
         {
-            output[i * comp + k] = ( float )(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+            output[i * comp + k] = (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
         }
-        if(k < comp)
+        if (k < comp)
             output[i * comp + k] = data[i * comp + k] / 255.0f;
     }
     STBI_FREE(data);
@@ -1504,43 +1498,43 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp)
 #endif
 
 #ifndef STBI_NO_HDR
-#define stbi__float2int(x) (( int )(x))
+#define stbi__float2int(x) ((int)(x))
 static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp)
 {
     int i, k, n;
     stbi_uc* output;
-    if(!data)
+    if (!data)
         return NULL;
-    output = ( stbi_uc* )stbi__malloc_mad3(x, y, comp, 0);
-    if(output == NULL)
+    output = (stbi_uc*)stbi__malloc_mad3(x, y, comp, 0);
+    if (output == NULL)
     {
         STBI_FREE(data);
         return stbi__errpuc("outofmem", "Out of memory");
     }
     // compute number of non-alpha components
-    if(comp & 1)
+    if (comp & 1)
         n = comp;
     else
         n = comp - 1;
-    for(i = 0; i < x * y; ++i)
+    for (i = 0; i < x * y; ++i)
     {
-        for(k = 0; k < n; ++k)
+        for (k = 0; k < n; ++k)
         {
-            float z = ( float )pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
-            if(z < 0)
+            float z = (float)pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+            if (z < 0)
                 z = 0;
-            if(z > 255)
+            if (z > 255)
                 z = 255;
-            output[i * comp + k] = ( stbi_uc )stbi__float2int(z);
+            output[i * comp + k] = (stbi_uc)stbi__float2int(z);
         }
-        if(k < comp)
+        if (k < comp)
         {
             float z = data[i * comp + k] * 255 + 0.5f;
-            if(z < 0)
+            if (z < 0)
                 z = 0;
-            if(z > 255)
+            if (z > 255)
                 z = 255;
-            output[i * comp + k] = ( stbi_uc )stbi__float2int(z);
+            output[i * comp + k] = (stbi_uc)stbi__float2int(z);
         }
     }
     STBI_FREE(data);
@@ -1572,7 +1566,7 @@ static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp)
 #ifndef STBI_NO_JPEG
 
 // huffman decoding acceleration
-#define FAST_BITS 9    // larger handles more cases; smaller stomps less cache
+#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache
 
 typedef struct
 {
@@ -1582,7 +1576,7 @@ typedef struct
     stbi_uc values[256];
     stbi_uc size[257];
     unsigned int maxcode[18];
-    int delta[17];    // old 'firstsymbol' - old 'firstcode'
+    int delta[17]; // old 'firstsymbol' - old 'firstcode'
 } stbi__huffman;
 
 typedef struct
@@ -1611,14 +1605,14 @@ typedef struct
         stbi_uc* data;
         void *raw_data, *raw_coeff;
         stbi_uc* linebuf;
-        short* coeff;    // progressive only
-        int coeff_w, coeff_h;    // number of 8x8 coefficient blocks
+        short* coeff;         // progressive only
+        int coeff_w, coeff_h; // number of 8x8 coefficient blocks
     } img_comp[4];
 
-    stbi__uint32 code_buffer;    // jpeg entropy-coded buffer
-    int code_bits;    // number of valid bits
-    unsigned char marker;    // marker seen while filling entropy buffer
-    int nomore;    // flag if we saw a marker so must stop
+    stbi__uint32 code_buffer; // jpeg entropy-coded buffer
+    int code_bits;            // number of valid bits
+    unsigned char marker;     // marker seen while filling entropy buffer
+    int nomore;               // flag if we saw a marker so must stop
 
     int progressive;
     int spec_start;
@@ -1627,7 +1621,7 @@ typedef struct
     int succ_low;
     int eob_run;
     int jfif;
-    int app14_color_transform;    // Adobe APP14 tag
+    int app14_color_transform; // Adobe APP14 tag
     int rgb;
 
     int scan_n, order[4];
@@ -1645,23 +1639,23 @@ static int stbi__build_huffman(stbi__huffman* h, int* count)
     int i, j, k = 0;
     unsigned int code;
     // build size list for each symbol (from JPEG spec)
-    for(i = 0; i < 16; ++i)
-        for(j = 0; j < count[i]; ++j)
+    for (i = 0; i < 16; ++i)
+        for (j = 0; j < count[i]; ++j)
             h->size[k++] = (stbi_uc)(i + 1);
     h->size[k] = 0;
 
     // compute actual symbols (from jpeg spec)
     code = 0;
     k = 0;
-    for(j = 1; j <= 16; ++j)
+    for (j = 1; j <= 16; ++j)
     {
         // compute delta to add to code to compute symbol id
         h->delta[j] = k - code;
-        if(h->size[k] == j)
+        if (h->size[k] == j)
         {
-            while(h->size[k] == j)
+            while (h->size[k] == j)
                 h->code[k++] = (stbi__uint16)(code++);
-            if(code - 1 >= (1u << j))
+            if (code - 1 >= (1u << j))
                 return stbi__err("bad code lengths", "Corrupt JPEG");
         }
         // compute largest code + 1 for this size, preshifted as needed later
@@ -1672,16 +1666,16 @@ static int stbi__build_huffman(stbi__huffman* h, int* count)
 
     // build non-spec acceleration table; 255 is flag for not-accelerated
     memset(h->fast, 255, 1 << FAST_BITS);
-    for(i = 0; i < k; ++i)
+    for (i = 0; i < k; ++i)
     {
         int s = h->size[i];
-        if(s <= FAST_BITS)
+        if (s <= FAST_BITS)
         {
             int c = h->code[i] << (FAST_BITS - s);
             int m = 1 << (FAST_BITS - s);
-            for(j = 0; j < m; ++j)
+            for (j = 0; j < m; ++j)
             {
-                h->fast[c + j] = ( stbi_uc )i;
+                h->fast[c + j] = (stbi_uc)i;
             }
         }
     }
@@ -1693,26 +1687,26 @@ static int stbi__build_huffman(stbi__huffman* h, int* count)
 static void stbi__build_fast_ac(stbi__int16* fast_ac, stbi__huffman* h)
 {
     int i;
-    for(i = 0; i < (1 << FAST_BITS); ++i)
+    for (i = 0; i < (1 << FAST_BITS); ++i)
     {
         stbi_uc fast = h->fast[i];
         fast_ac[i] = 0;
-        if(fast < 255)
+        if (fast < 255)
         {
             int rs = h->values[fast];
             int run = (rs >> 4) & 15;
             int magbits = rs & 15;
             int len = h->size[fast];
 
-            if(magbits && len + magbits <= FAST_BITS)
+            if (magbits && len + magbits <= FAST_BITS)
             {
                 // magnitude code followed by receive_extend code
                 int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
                 int m = 1 << (magbits - 1);
-                if(k < m)
+                if (k < m)
                     k += (~0U << magbits) + 1;
                 // if the result is small enough, we can fit it in fast_ac table
-                if(k >= -128 && k <= 127)
+                if (k >= -128 && k <= 127)
                     fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits));
             }
         }
@@ -1724,25 +1718,25 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg* j)
     do
     {
         unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
-        if(b == 0xff)
+        if (b == 0xff)
         {
             int c = stbi__get8(j->s);
-            while(c == 0xff)
-                c = stbi__get8(j->s);    // consume fill bytes
-            if(c != 0)
+            while (c == 0xff)
+                c = stbi__get8(j->s); // consume fill bytes
+            if (c != 0)
             {
-                j->marker = ( unsigned char )c;
+                j->marker = (unsigned char)c;
                 j->nomore = 1;
                 return;
             }
         }
         j->code_buffer |= b << (24 - j->code_bits);
         j->code_bits += 8;
-    } while(j->code_bits <= 24);
+    } while (j->code_bits <= 24);
 }
 
 // (1 << n) - 1
-static const stbi__uint32 stbi__bmask[17] = {0,   1,    3,    7,    15,   31,    63,    127,  255,
+static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255,
                                              511, 1023, 2047, 4095, 8191, 16383, 32767, 65535};
 
 // decode a jpeg huffman value from the bitstream
@@ -1751,17 +1745,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h)
     unsigned int temp;
     int c, k;
 
-    if(j->code_bits < 16)
+    if (j->code_bits < 16)
         stbi__grow_buffer_unsafe(j);
 
     // look at the top FAST_BITS and determine what symbol ID it is,
     // if the code is <= FAST_BITS
     c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
     k = h->fast[c];
-    if(k < 255)
+    if (k < 255)
     {
         int s = h->size[k];
-        if(s > j->code_bits)
+        if (s > j->code_bits)
             return -1;
         j->code_buffer <<= s;
         j->code_bits -= s;
@@ -1775,17 +1769,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h)
     // wants to be compared against something shifted to have 16;
     // that way we don't need to shift inside the loop.
     temp = j->code_buffer >> 16;
-    for(k = FAST_BITS + 1;; ++k)
-        if(temp < h->maxcode[k])
+    for (k = FAST_BITS + 1;; ++k)
+        if (temp < h->maxcode[k])
             break;
-    if(k == 17)
+    if (k == 17)
     {
         // error! code not found
         j->code_bits -= 16;
         return -1;
     }
 
-    if(k > j->code_bits)
+    if (k > j->code_bits)
         return -1;
 
     // convert the huffman code to the symbol id
@@ -1799,7 +1793,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h)
 }
 
 // bias[n] = (-1<<n) + 1
-static const int stbi__jbias[16] = {0,    -1,   -3,    -7,    -15,   -31,   -63,    -127,
+static const int stbi__jbias[16] = {0, -1, -3, -7, -15, -31, -63, -127,
                                     -255, -511, -1023, -2047, -4095, -8191, -16383, -32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
@@ -1808,12 +1802,12 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg* j, int n)
 {
     unsigned int k;
     int sgn;
-    if(j->code_bits < n)
+    if (j->code_bits < n)
         stbi__grow_buffer_unsafe(j);
 
-    sgn = ( stbi__int32 )j->code_buffer >> 31;    // sign bit is always in MSB
+    sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
     k = stbi_lrot(j->code_buffer, n);
-    STBI_ASSERT(n >= 0 && n < ( int )(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
+    STBI_ASSERT(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
     j->code_buffer = k & ~stbi__bmask[n];
     k &= stbi__bmask[n];
     j->code_bits -= n;
@@ -1824,7 +1818,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg* j, int n)
 stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n)
 {
     unsigned int k;
-    if(j->code_bits < n)
+    if (j->code_bits < n)
         stbi__grow_buffer_unsafe(j);
     k = stbi_lrot(j->code_buffer, n);
     j->code_buffer = k & ~stbi__bmask[n];
@@ -1836,7 +1830,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n)
 stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg* j)
 {
     unsigned int k;
-    if(j->code_bits < 1)
+    if (j->code_bits < 1)
         stbi__grow_buffer_unsafe(j);
     k = j->code_buffer;
     j->code_buffer <<= 1;
@@ -1860,10 +1854,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
     int diff, dc, k;
     int t;
 
-    if(j->code_bits < 16)
+    if (j->code_bits < 16)
         stbi__grow_buffer_unsafe(j);
     t = stbi__jpeg_huff_decode(j, hdc);
-    if(t < 0)
+    if (t < 0)
         return stbi__err("bad huffman code", "Corrupt JPEG");
 
     // 0 all the ac values now so we can do it 32-bits at a time
@@ -1872,7 +1866,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
     diff = t ? stbi__extend_receive(j, t) : 0;
     dc = j->img_comp[b].dc_pred + diff;
     j->img_comp[b].dc_pred = dc;
-    data[0] = ( short )(dc * dequant[0]);
+    data[0] = (short)(dc * dequant[0]);
 
     // decode AC components, see JPEG spec
     k = 1;
@@ -1880,31 +1874,31 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
     {
         unsigned int zig;
         int c, r, s;
-        if(j->code_bits < 16)
+        if (j->code_bits < 16)
             stbi__grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
         r = fac[c];
-        if(r)
-        {    // fast-AC path
-            k += (r >> 4) & 15;    // run
-            s = r & 15;    // combined length
+        if (r)
+        {                       // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15;         // combined length
             j->code_buffer <<= s;
             j->code_bits -= s;
             // decode into unzigzag'd location
             zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = ( short )((r >> 8) * dequant[zig]);
+            data[zig] = (short)((r >> 8) * dequant[zig]);
         }
         else
         {
             int rs = stbi__jpeg_huff_decode(j, hac);
-            if(rs < 0)
+            if (rs < 0)
                 return stbi__err("bad huffman code", "Corrupt JPEG");
             s = rs & 15;
             r = rs >> 4;
-            if(s == 0)
+            if (s == 0)
             {
-                if(rs != 0xf0)
-                    break;    // end block
+                if (rs != 0xf0)
+                    break; // end block
                 k += 16;
             }
             else
@@ -1912,10 +1906,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
                 k += r;
                 // decode into unzigzag'd location
                 zig = stbi__jpeg_dezigzag[k++];
-                data[zig] = ( short )(stbi__extend_receive(j, s) * dequant[zig]);
+                data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]);
             }
         }
-    } while(k < 64);
+    } while (k < 64);
     return 1;
 }
 
@@ -1923,28 +1917,28 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__
 {
     int diff, dc;
     int t;
-    if(j->spec_end != 0)
+    if (j->spec_end != 0)
         return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 
-    if(j->code_bits < 16)
+    if (j->code_bits < 16)
         stbi__grow_buffer_unsafe(j);
 
-    if(j->succ_high == 0)
+    if (j->succ_high == 0)
     {
         // first scan for DC coefficient, must be first
-        memset(data, 0, 64 * sizeof(data[0]));    // 0 all the ac values now
+        memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
         t = stbi__jpeg_huff_decode(j, hdc);
         diff = t ? stbi__extend_receive(j, t) : 0;
 
         dc = j->img_comp[b].dc_pred + diff;
         j->img_comp[b].dc_pred = dc;
-        data[0] = ( short )(dc << j->succ_low);
+        data[0] = (short)(dc << j->succ_low);
     }
     else
     {
         // refinement scan for DC coefficient
-        if(stbi__jpeg_get_bit(j))
-            data[0] += ( short )(1 << j->succ_low);
+        if (stbi__jpeg_get_bit(j))
+            data[0] += (short)(1 << j->succ_low);
     }
     return 1;
 }
@@ -1954,14 +1948,14 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__
 static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__huffman* hac, stbi__int16* fac)
 {
     int k;
-    if(j->spec_start == 0)
+    if (j->spec_start == 0)
         return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 
-    if(j->succ_high == 0)
+    if (j->succ_high == 0)
     {
         int shift = j->succ_low;
 
-        if(j->eob_run)
+        if (j->eob_run)
         {
             --j->eob_run;
             return 1;
@@ -1972,32 +1966,32 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
         {
             unsigned int zig;
             int c, r, s;
-            if(j->code_bits < 16)
+            if (j->code_bits < 16)
                 stbi__grow_buffer_unsafe(j);
             c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
             r = fac[c];
-            if(r)
-            {    // fast-AC path
-                k += (r >> 4) & 15;    // run
-                s = r & 15;    // combined length
+            if (r)
+            {                       // fast-AC path
+                k += (r >> 4) & 15; // run
+                s = r & 15;         // combined length
                 j->code_buffer <<= s;
                 j->code_bits -= s;
                 zig = stbi__jpeg_dezigzag[k++];
-                data[zig] = ( short )((r >> 8) << shift);
+                data[zig] = (short)((r >> 8) << shift);
             }
             else
             {
                 int rs = stbi__jpeg_huff_decode(j, hac);
-                if(rs < 0)
+                if (rs < 0)
                     return stbi__err("bad huffman code", "Corrupt JPEG");
                 s = rs & 15;
                 r = rs >> 4;
-                if(s == 0)
+                if (s == 0)
                 {
-                    if(r < 15)
+                    if (r < 15)
                     {
                         j->eob_run = (1 << r);
-                        if(r)
+                        if (r)
                             j->eob_run += stbi__jpeg_get_bits(j, r);
                         --j->eob_run;
                         break;
@@ -2008,28 +2002,28 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
                 {
                     k += r;
                     zig = stbi__jpeg_dezigzag[k++];
-                    data[zig] = ( short )(stbi__extend_receive(j, s) << shift);
+                    data[zig] = (short)(stbi__extend_receive(j, s) << shift);
                 }
             }
-        } while(k <= j->spec_end);
+        } while (k <= j->spec_end);
     }
     else
     {
         // refinement scan for these AC coefficients
 
-        short bit = ( short )(1 << j->succ_low);
+        short bit = (short)(1 << j->succ_low);
 
-        if(j->eob_run)
+        if (j->eob_run)
         {
             --j->eob_run;
-            for(k = j->spec_start; k <= j->spec_end; ++k)
+            for (k = j->spec_start; k <= j->spec_end; ++k)
             {
                 short* p = &data[stbi__jpeg_dezigzag[k]];
-                if(*p != 0)
-                    if(stbi__jpeg_get_bit(j))
-                        if((*p & bit) == 0)
+                if (*p != 0)
+                    if (stbi__jpeg_get_bit(j))
+                        if ((*p & bit) == 0)
                         {
-                            if(*p > 0)
+                            if (*p > 0)
                                 *p += bit;
                             else
                                 *p -= bit;
@@ -2043,19 +2037,19 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
             {
                 int r, s;
                 int rs = stbi__jpeg_huff_decode(
-                    j, hac);    // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
-                if(rs < 0)
+                    j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+                if (rs < 0)
                     return stbi__err("bad huffman code", "Corrupt JPEG");
                 s = rs & 15;
                 r = rs >> 4;
-                if(s == 0)
+                if (s == 0)
                 {
-                    if(r < 15)
+                    if (r < 15)
                     {
                         j->eob_run = (1 << r) - 1;
-                        if(r)
+                        if (r)
                             j->eob_run += stbi__jpeg_get_bits(j, r);
-                        r = 64;    // force end of block
+                        r = 64; // force end of block
                     }
                     else
                     {
@@ -2066,25 +2060,25 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
                 }
                 else
                 {
-                    if(s != 1)
+                    if (s != 1)
                         return stbi__err("bad huffman code", "Corrupt JPEG");
                     // sign bit
-                    if(stbi__jpeg_get_bit(j))
+                    if (stbi__jpeg_get_bit(j))
                         s = bit;
                     else
                         s = -bit;
                 }
 
                 // advance by r
-                while(k <= j->spec_end)
+                while (k <= j->spec_end)
                 {
                     short* p = &data[stbi__jpeg_dezigzag[k++]];
-                    if(*p != 0)
+                    if (*p != 0)
                     {
-                        if(stbi__jpeg_get_bit(j))
-                            if((*p & bit) == 0)
+                        if (stbi__jpeg_get_bit(j))
+                            if ((*p & bit) == 0)
                             {
-                                if(*p > 0)
+                                if (*p > 0)
                                     *p += bit;
                                 else
                                     *p -= bit;
@@ -2092,15 +2086,15 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
                     }
                     else
                     {
-                        if(r == 0)
+                        if (r == 0)
                         {
-                            *p = ( short )s;
+                            *p = (short)s;
                             break;
                         }
                         --r;
                     }
                 }
-            } while(k <= j->spec_end);
+            } while (k <= j->spec_end);
         }
     }
     return 1;
@@ -2110,18 +2104,18 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
 stbi_inline static stbi_uc stbi__clamp(int x)
 {
     // trick to use a single test to catch both cases
-    if(( unsigned int )x > 255)
+    if ((unsigned int)x > 255)
     {
-        if(x < 0)
+        if (x < 0)
             return 0;
-        if(x > 255)
+        if (x > 255)
             return 255;
     }
-    return ( stbi_uc )x;
+    return (stbi_uc)x;
 }
 
-#define stbi__f2f(x) (( int )((( x )*4096 + 0.5)))
-#define stbi__fsh(x) (( x )*4096)
+#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
+#define stbi__fsh(x) ((x)*4096)
 
 // derived from jidctint -- DCT_ISLOW
 #define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7)       \
@@ -2168,10 +2162,10 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64])
     short* d = data;
 
     // columns
-    for(i = 0; i < 8; ++i, ++d, ++v)
+    for (i = 0; i < 8; ++i, ++d, ++v)
     {
         // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-        if(d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0)
+        if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0)
         {
             //    no shortcut                 0     seconds
             //    (1|2|3|4|5|6|7)==0          0     seconds
@@ -2200,7 +2194,7 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64])
         }
     }
 
-    for(i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride)
+    for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride)
     {
         // no fast case since the first 1D IDCT spread components out
         STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
@@ -2330,14 +2324,14 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
     __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
 
     // load
-    row0 = _mm_load_si128(( const __m128i* )(data + 0 * 8));
-    row1 = _mm_load_si128(( const __m128i* )(data + 1 * 8));
-    row2 = _mm_load_si128(( const __m128i* )(data + 2 * 8));
-    row3 = _mm_load_si128(( const __m128i* )(data + 3 * 8));
-    row4 = _mm_load_si128(( const __m128i* )(data + 4 * 8));
-    row5 = _mm_load_si128(( const __m128i* )(data + 5 * 8));
-    row6 = _mm_load_si128(( const __m128i* )(data + 6 * 8));
-    row7 = _mm_load_si128(( const __m128i* )(data + 7 * 8));
+    row0 = _mm_load_si128((const __m128i*)(data + 0 * 8));
+    row1 = _mm_load_si128((const __m128i*)(data + 1 * 8));
+    row2 = _mm_load_si128((const __m128i*)(data + 2 * 8));
+    row3 = _mm_load_si128((const __m128i*)(data + 3 * 8));
+    row4 = _mm_load_si128((const __m128i*)(data + 4 * 8));
+    row5 = _mm_load_si128((const __m128i*)(data + 5 * 8));
+    row6 = _mm_load_si128((const __m128i*)(data + 6 * 8));
+    row7 = _mm_load_si128((const __m128i*)(data + 7 * 8));
 
     // column pass
     dct_pass(bias_0, 10);
@@ -2367,39 +2361,39 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 
     {
         // pack
-        __m128i p0 = _mm_packus_epi16(row0, row1);    // a0a1a2a3...a7b0b1b2b3...b7
+        __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
         __m128i p1 = _mm_packus_epi16(row2, row3);
         __m128i p2 = _mm_packus_epi16(row4, row5);
         __m128i p3 = _mm_packus_epi16(row6, row7);
 
         // 8bit 8x8 transpose pass 1
-        dct_interleave8(p0, p2);    // a0e0a1e1...
-        dct_interleave8(p1, p3);    // c0g0c1g1...
+        dct_interleave8(p0, p2); // a0e0a1e1...
+        dct_interleave8(p1, p3); // c0g0c1g1...
 
         // transpose pass 2
-        dct_interleave8(p0, p1);    // a0c0e0g0...
-        dct_interleave8(p2, p3);    // b0d0f0h0...
+        dct_interleave8(p0, p1); // a0c0e0g0...
+        dct_interleave8(p2, p3); // b0d0f0h0...
 
         // transpose pass 3
-        dct_interleave8(p0, p2);    // a0b0c0d0...
-        dct_interleave8(p1, p3);    // a4b4c4d4...
+        dct_interleave8(p0, p2); // a0b0c0d0...
+        dct_interleave8(p1, p3); // a4b4c4d4...
 
         // store
-        _mm_storel_epi64(( __m128i* )out, p0);
+        _mm_storel_epi64((__m128i*)out, p0);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p0, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p0, 0x4e));
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, p2);
+        _mm_storel_epi64((__m128i*)out, p2);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p2, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p2, 0x4e));
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, p1);
+        _mm_storel_epi64((__m128i*)out, p1);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p1, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p1, 0x4e));
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, p3);
+        _mm_storel_epi64((__m128i*)out, p3);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p3, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p3, 0x4e));
     }
 
 #undef dct_const
@@ -2413,7 +2407,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 #undef dct_pass
 }
 
-#endif    // STBI_SSE2
+#endif // STBI_SSE2
 
 #ifdef STBI_NEON
 
@@ -2548,19 +2542,19 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
     }
 
         // pass 1
-        dct_trn16(row0, row1);    // a0b0a2b2a4b4a6b6
+        dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
         dct_trn16(row2, row3);
         dct_trn16(row4, row5);
         dct_trn16(row6, row7);
 
         // pass 2
-        dct_trn32(row0, row2);    // a0b0c0d0a4b4c4d4
+        dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
         dct_trn32(row1, row3);
         dct_trn32(row4, row6);
         dct_trn32(row5, row7);
 
         // pass 3
-        dct_trn64(row0, row4);    // a0b0c0d0e0f0g0h0
+        dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
         dct_trn64(row1, row5);
         dct_trn64(row2, row6);
         dct_trn64(row3, row7);
@@ -2659,7 +2653,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 #undef dct_pass
 }
 
-#endif    // STBI_NEON
+#endif // STBI_NEON
 
 #define STBI__MARKER_none 0xff
 // if there's a pending marker from the entropy stream, return that
@@ -2668,17 +2662,17 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 static stbi_uc stbi__get_marker(stbi__jpeg* j)
 {
     stbi_uc x;
-    if(j->marker != STBI__MARKER_none)
+    if (j->marker != STBI__MARKER_none)
     {
         x = j->marker;
         j->marker = STBI__MARKER_none;
         return x;
     }
     x = stbi__get8(j->s);
-    if(x != 0xff)
+    if (x != 0xff)
         return STBI__MARKER_none;
-    while(x == 0xff)
-        x = stbi__get8(j->s);    // consume repeated 0xff fill bytes
+    while (x == 0xff)
+        x = stbi__get8(j->s); // consume repeated 0xff fill bytes
     return x;
 }
 
@@ -2704,9 +2698,9 @@ static void stbi__jpeg_reset(stbi__jpeg* j)
 static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
 {
     stbi__jpeg_reset(z);
-    if(!z->progressive)
+    if (!z->progressive)
     {
-        if(z->scan_n == 1)
+        if (z->scan_n == 1)
         {
             int i, j;
             STBI_SIMD_ALIGN(short, data[64]);
@@ -2717,24 +2711,24 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             // component has, independent of interleaved MCU blocking and such
             int w = (z->img_comp[n].x + 7) >> 3;
             int h = (z->img_comp[n].y + 7) >> 3;
-            for(j = 0; j < h; ++j)
+            for (j = 0; j < h; ++j)
             {
-                for(i = 0; i < w; ++i)
+                for (i = 0; i < w; ++i)
                 {
                     int ha = z->img_comp[n].ha;
-                    if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
-                                                z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
+                    if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
+                                                 z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
                         return 0;
                     z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2,
                                          data);
                     // every data block is an MCU, so countdown the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
                         // if it's NOT a restart, then just bail, so we get corrupt data
                         // rather than no data
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2743,28 +2737,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             return 1;
         }
         else
-        {    // interleaved
+        { // interleaved
             int i, j, k, x, y;
             STBI_SIMD_ALIGN(short, data[64]);
-            for(j = 0; j < z->img_mcu_y; ++j)
+            for (j = 0; j < z->img_mcu_y; ++j)
             {
-                for(i = 0; i < z->img_mcu_x; ++i)
+                for (i = 0; i < z->img_mcu_x; ++i)
                 {
                     // scan an interleaved mcu... process scan_n components in order
-                    for(k = 0; k < z->scan_n; ++k)
+                    for (k = 0; k < z->scan_n; ++k)
                     {
                         int n = z->order[k];
                         // scan out an mcu's worth of this component; that's just determined
                         // by the basic H and V specified for the component
-                        for(y = 0; y < z->img_comp[n].v; ++y)
+                        for (y = 0; y < z->img_comp[n].v; ++y)
                         {
-                            for(x = 0; x < z->img_comp[n].h; ++x)
+                            for (x = 0; x < z->img_comp[n].h; ++x)
                             {
                                 int x2 = (i * z->img_comp[n].h + x) * 8;
                                 int y2 = (j * z->img_comp[n].v + y) * 8;
                                 int ha = z->img_comp[n].ha;
-                                if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
-                                                            z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
+                                if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
+                                                             z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
                                     return 0;
                                 z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2,
                                                      z->img_comp[n].w2, data);
@@ -2773,11 +2767,11 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
                     }
                     // after all interleaved components, that's an interleaved MCU,
                     // so now count down the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2788,7 +2782,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
     }
     else
     {
-        if(z->scan_n == 1)
+        if (z->scan_n == 1)
         {
             int i, j;
             int n = z->order[0];
@@ -2798,28 +2792,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             // component has, independent of interleaved MCU blocking and such
             int w = (z->img_comp[n].x + 7) >> 3;
             int h = (z->img_comp[n].y + 7) >> 3;
-            for(j = 0; j < h; ++j)
+            for (j = 0; j < h; ++j)
             {
-                for(i = 0; i < w; ++i)
+                for (i = 0; i < w; ++i)
                 {
                     short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-                    if(z->spec_start == 0)
+                    if (z->spec_start == 0)
                     {
-                        if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                             return 0;
                     }
                     else
                     {
                         int ha = z->img_comp[n].ha;
-                        if(!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                        if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
                             return 0;
                     }
                     // every data block is an MCU, so countdown the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2828,37 +2822,37 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             return 1;
         }
         else
-        {    // interleaved
+        { // interleaved
             int i, j, k, x, y;
-            for(j = 0; j < z->img_mcu_y; ++j)
+            for (j = 0; j < z->img_mcu_y; ++j)
             {
-                for(i = 0; i < z->img_mcu_x; ++i)
+                for (i = 0; i < z->img_mcu_x; ++i)
                 {
                     // scan an interleaved mcu... process scan_n components in order
-                    for(k = 0; k < z->scan_n; ++k)
+                    for (k = 0; k < z->scan_n; ++k)
                     {
                         int n = z->order[k];
                         // scan out an mcu's worth of this component; that's just determined
                         // by the basic H and V specified for the component
-                        for(y = 0; y < z->img_comp[n].v; ++y)
+                        for (y = 0; y < z->img_comp[n].v; ++y)
                         {
-                            for(x = 0; x < z->img_comp[n].h; ++x)
+                            for (x = 0; x < z->img_comp[n].h; ++x)
                             {
                                 int x2 = (i * z->img_comp[n].h + x);
                                 int y2 = (j * z->img_comp[n].v + y);
                                 short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
-                                if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                                if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                                     return 0;
                             }
                         }
                     }
                     // after all interleaved components, that's an interleaved MCU,
                     // so now count down the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2872,23 +2866,23 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
 static void stbi__jpeg_dequantize(short* data, stbi__uint16* dequant)
 {
     int i;
-    for(i = 0; i < 64; ++i)
+    for (i = 0; i < 64; ++i)
         data[i] *= dequant[i];
 }
 
 static void stbi__jpeg_finish(stbi__jpeg* z)
 {
-    if(z->progressive)
+    if (z->progressive)
     {
         // dequantize and idct the data
         int i, j, n;
-        for(n = 0; n < z->s->img_n; ++n)
+        for (n = 0; n < z->s->img_n; ++n)
         {
             int w = (z->img_comp[n].x + 7) >> 3;
             int h = (z->img_comp[n].y + 7) >> 3;
-            for(j = 0; j < h; ++j)
+            for (j = 0; j < h; ++j)
             {
-                for(i = 0; i < w; ++i)
+                for (i = 0; i < w; ++i)
                 {
                     short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
                     stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
@@ -2903,114 +2897,113 @@ static void stbi__jpeg_finish(stbi__jpeg* z)
 static int stbi__process_marker(stbi__jpeg* z, int m)
 {
     int L;
-    switch(m)
+    switch (m)
     {
-        case STBI__MARKER_none:    // no marker found
-            return stbi__err("expected marker", "Corrupt JPEG");
+    case STBI__MARKER_none: // no marker found
+        return stbi__err("expected marker", "Corrupt JPEG");
 
-        case 0xDD:    // DRI - specify restart interval
-            if(stbi__get16be(z->s) != 4)
-                return stbi__err("bad DRI len", "Corrupt JPEG");
-            z->restart_interval = stbi__get16be(z->s);
-            return 1;
+    case 0xDD: // DRI - specify restart interval
+        if (stbi__get16be(z->s) != 4)
+            return stbi__err("bad DRI len", "Corrupt JPEG");
+        z->restart_interval = stbi__get16be(z->s);
+        return 1;
+
+    case 0xDB: // DQT - define quantization table
+        L = stbi__get16be(z->s) - 2;
+        while (L > 0)
+        {
+            int q = stbi__get8(z->s);
+            int p = q >> 4, sixteen = (p != 0);
+            int t = q & 15, i;
+            if (p != 0 && p != 1)
+                return stbi__err("bad DQT type", "Corrupt JPEG");
+            if (t > 3)
+                return stbi__err("bad DQT table", "Corrupt JPEG");
+
+            for (i = 0; i < 64; ++i)
+                z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            L -= (sixteen ? 129 : 65);
+        }
+        return L == 0;
 
-        case 0xDB:    // DQT - define quantization table
-            L = stbi__get16be(z->s) - 2;
-            while(L > 0)
+    case 0xC4: // DHT - define huffman table
+        L = stbi__get16be(z->s) - 2;
+        while (L > 0)
+        {
+            stbi_uc* v;
+            int sizes[16], i, n = 0;
+            int q = stbi__get8(z->s);
+            int tc = q >> 4;
+            int th = q & 15;
+            if (tc > 1 || th > 3)
+                return stbi__err("bad DHT header", "Corrupt JPEG");
+            for (i = 0; i < 16; ++i)
             {
-                int q = stbi__get8(z->s);
-                int p = q >> 4, sixteen = (p != 0);
-                int t = q & 15, i;
-                if(p != 0 && p != 1)
-                    return stbi__err("bad DQT type", "Corrupt JPEG");
-                if(t > 3)
-                    return stbi__err("bad DQT table", "Corrupt JPEG");
-
-                for(i = 0; i < 64; ++i)
-                    z->dequant[t][stbi__jpeg_dezigzag[i]] =
-                        (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
-                L -= (sixteen ? 129 : 65);
+                sizes[i] = stbi__get8(z->s);
+                n += sizes[i];
             }
-            return L == 0;
-
-        case 0xC4:    // DHT - define huffman table
-            L = stbi__get16be(z->s) - 2;
-            while(L > 0)
+            L -= 17;
+            if (tc == 0)
             {
-                stbi_uc* v;
-                int sizes[16], i, n = 0;
-                int q = stbi__get8(z->s);
-                int tc = q >> 4;
-                int th = q & 15;
-                if(tc > 1 || th > 3)
-                    return stbi__err("bad DHT header", "Corrupt JPEG");
-                for(i = 0; i < 16; ++i)
-                {
-                    sizes[i] = stbi__get8(z->s);
-                    n += sizes[i];
-                }
-                L -= 17;
-                if(tc == 0)
-                {
-                    if(!stbi__build_huffman(z->huff_dc + th, sizes))
-                        return 0;
-                    v = z->huff_dc[th].values;
-                }
-                else
-                {
-                    if(!stbi__build_huffman(z->huff_ac + th, sizes))
-                        return 0;
-                    v = z->huff_ac[th].values;
-                }
-                for(i = 0; i < n; ++i)
-                    v[i] = stbi__get8(z->s);
-                if(tc != 0)
-                    stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
-                L -= n;
+                if (!stbi__build_huffman(z->huff_dc + th, sizes))
+                    return 0;
+                v = z->huff_dc[th].values;
             }
-            return L == 0;
+            else
+            {
+                if (!stbi__build_huffman(z->huff_ac + th, sizes))
+                    return 0;
+                v = z->huff_ac[th].values;
+            }
+            for (i = 0; i < n; ++i)
+                v[i] = stbi__get8(z->s);
+            if (tc != 0)
+                stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+            L -= n;
+        }
+        return L == 0;
     }
 
     // check for comment block or APP blocks
-    if((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
+    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
     {
         L = stbi__get16be(z->s);
-        if(L < 2)
+        if (L < 2)
         {
-            if(m == 0xFE)
+            if (m == 0xFE)
                 return stbi__err("bad COM len", "Corrupt JPEG");
             else
                 return stbi__err("bad APP len", "Corrupt JPEG");
         }
         L -= 2;
 
-        if(m == 0xE0 && L >= 5)
-        {    // JFIF APP0 segment
+        if (m == 0xE0 && L >= 5)
+        { // JFIF APP0 segment
             static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'};
             int ok = 1;
             int i;
-            for(i = 0; i < 5; ++i)
-                if(stbi__get8(z->s) != tag[i])
+            for (i = 0; i < 5; ++i)
+                if (stbi__get8(z->s) != tag[i])
                     ok = 0;
             L -= 5;
-            if(ok)
+            if (ok)
                 z->jfif = 1;
         }
-        else if(m == 0xEE && L >= 12)
-        {    // Adobe APP14 segment
+        else if (m == 0xEE && L >= 12)
+        { // Adobe APP14 segment
             static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'};
             int ok = 1;
             int i;
-            for(i = 0; i < 6; ++i)
-                if(stbi__get8(z->s) != tag[i])
+            for (i = 0; i < 6; ++i)
+                if (stbi__get8(z->s) != tag[i])
                     ok = 0;
             L -= 6;
-            if(ok)
+            if (ok)
             {
-                stbi__get8(z->s);    // version
-                stbi__get16be(z->s);    // flags0
-                stbi__get16be(z->s);    // flags1
-                z->app14_color_transform = stbi__get8(z->s);    // color transform
+                stbi__get8(z->s);                            // version
+                stbi__get16be(z->s);                         // flags0
+                stbi__get16be(z->s);                         // flags1
+                z->app14_color_transform = stbi__get8(z->s); // color transform
                 L -= 6;
             }
         }
@@ -3028,24 +3021,24 @@ static int stbi__process_scan_header(stbi__jpeg* z)
     int i;
     int Ls = stbi__get16be(z->s);
     z->scan_n = stbi__get8(z->s);
-    if(z->scan_n < 1 || z->scan_n > 4 || z->scan_n > ( int )z->s->img_n)
+    if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n)
         return stbi__err("bad SOS component count", "Corrupt JPEG");
-    if(Ls != 6 + 2 * z->scan_n)
+    if (Ls != 6 + 2 * z->scan_n)
         return stbi__err("bad SOS len", "Corrupt JPEG");
-    for(i = 0; i < z->scan_n; ++i)
+    for (i = 0; i < z->scan_n; ++i)
     {
         int id = stbi__get8(z->s), which;
         int q = stbi__get8(z->s);
-        for(which = 0; which < z->s->img_n; ++which)
-            if(z->img_comp[which].id == id)
+        for (which = 0; which < z->s->img_n; ++which)
+            if (z->img_comp[which].id == id)
                 break;
-        if(which == z->s->img_n)
-            return 0;    // no match
+        if (which == z->s->img_n)
+            return 0; // no match
         z->img_comp[which].hd = q >> 4;
-        if(z->img_comp[which].hd > 3)
+        if (z->img_comp[which].hd > 3)
             return stbi__err("bad DC huff", "Corrupt JPEG");
         z->img_comp[which].ha = q & 15;
-        if(z->img_comp[which].ha > 3)
+        if (z->img_comp[which].ha > 3)
             return stbi__err("bad AC huff", "Corrupt JPEG");
         z->order[i] = which;
     }
@@ -3053,21 +3046,20 @@ static int stbi__process_scan_header(stbi__jpeg* z)
     {
         int aa;
         z->spec_start = stbi__get8(z->s);
-        z->spec_end = stbi__get8(z->s);    // should be 63, but might be 0
+        z->spec_end = stbi__get8(z->s); // should be 63, but might be 0
         aa = stbi__get8(z->s);
         z->succ_high = (aa >> 4);
         z->succ_low = (aa & 15);
-        if(z->progressive)
+        if (z->progressive)
         {
-            if(z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 ||
-               z->succ_low > 13)
+            if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
                 return stbi__err("bad SOS", "Corrupt JPEG");
         }
         else
         {
-            if(z->spec_start != 0)
+            if (z->spec_start != 0)
                 return stbi__err("bad SOS", "Corrupt JPEG");
-            if(z->succ_high != 0 || z->succ_low != 0)
+            if (z->succ_high != 0 || z->succ_low != 0)
                 return stbi__err("bad SOS", "Corrupt JPEG");
             z->spec_end = 63;
         }
@@ -3079,21 +3071,21 @@ static int stbi__process_scan_header(stbi__jpeg* z)
 static int stbi__free_jpeg_components(stbi__jpeg* z, int ncomp, int why)
 {
     int i;
-    for(i = 0; i < ncomp; ++i)
+    for (i = 0; i < ncomp; ++i)
     {
-        if(z->img_comp[i].raw_data)
+        if (z->img_comp[i].raw_data)
         {
             STBI_FREE(z->img_comp[i].raw_data);
             z->img_comp[i].raw_data = NULL;
             z->img_comp[i].data = NULL;
         }
-        if(z->img_comp[i].raw_coeff)
+        if (z->img_comp[i].raw_coeff)
         {
             STBI_FREE(z->img_comp[i].raw_coeff);
             z->img_comp[i].raw_coeff = 0;
             z->img_comp[i].coeff = 0;
         }
-        if(z->img_comp[i].linebuf)
+        if (z->img_comp[i].linebuf)
         {
             STBI_FREE(z->img_comp[i].linebuf);
             z->img_comp[i].linebuf = NULL;
@@ -3107,62 +3099,62 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan)
     stbi__context* s = z->s;
     int Lf, p, i, q, h_max = 1, v_max = 1, c;
     Lf = stbi__get16be(s);
-    if(Lf < 11)
-        return stbi__err("bad SOF len", "Corrupt JPEG");    // JPEG
+    if (Lf < 11)
+        return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG
     p = stbi__get8(s);
-    if(p != 8)
-        return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only");    // JPEG baseline
+    if (p != 8)
+        return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline
     s->img_y = stbi__get16be(s);
-    if(s->img_y == 0)
+    if (s->img_y == 0)
         return stbi__err(
             "no header height",
-            "JPEG format not supported: delayed height");    // Legal, but we don't handle it--but neither does IJG
+            "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
     s->img_x = stbi__get16be(s);
-    if(s->img_x == 0)
-        return stbi__err("0 width", "Corrupt JPEG");    // JPEG requires
+    if (s->img_x == 0)
+        return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires
     c = stbi__get8(s);
-    if(c != 3 && c != 1 && c != 4)
+    if (c != 3 && c != 1 && c != 4)
         return stbi__err("bad component count", "Corrupt JPEG");
     s->img_n = c;
-    for(i = 0; i < c; ++i)
+    for (i = 0; i < c; ++i)
     {
         z->img_comp[i].data = NULL;
         z->img_comp[i].linebuf = NULL;
     }
 
-    if(Lf != 8 + 3 * s->img_n)
+    if (Lf != 8 + 3 * s->img_n)
         return stbi__err("bad SOF len", "Corrupt JPEG");
 
     z->rgb = 0;
-    for(i = 0; i < s->img_n; ++i)
+    for (i = 0; i < s->img_n; ++i)
     {
         static const unsigned char rgb[3] = {'R', 'G', 'B'};
         z->img_comp[i].id = stbi__get8(s);
-        if(s->img_n == 3 && z->img_comp[i].id == rgb[i])
+        if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
             ++z->rgb;
         q = stbi__get8(s);
         z->img_comp[i].h = (q >> 4);
-        if(!z->img_comp[i].h || z->img_comp[i].h > 4)
+        if (!z->img_comp[i].h || z->img_comp[i].h > 4)
             return stbi__err("bad H", "Corrupt JPEG");
         z->img_comp[i].v = q & 15;
-        if(!z->img_comp[i].v || z->img_comp[i].v > 4)
+        if (!z->img_comp[i].v || z->img_comp[i].v > 4)
             return stbi__err("bad V", "Corrupt JPEG");
         z->img_comp[i].tq = stbi__get8(s);
-        if(z->img_comp[i].tq > 3)
+        if (z->img_comp[i].tq > 3)
             return stbi__err("bad TQ", "Corrupt JPEG");
     }
 
-    if(scan != STBI__SCAN_load)
+    if (scan != STBI__SCAN_load)
         return 1;
 
-    if(!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0))
+    if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0))
         return stbi__err("too large", "Image too large to decode");
 
-    for(i = 0; i < s->img_n; ++i)
+    for (i = 0; i < s->img_n; ++i)
     {
-        if(z->img_comp[i].h > h_max)
+        if (z->img_comp[i].h > h_max)
             h_max = z->img_comp[i].h;
-        if(z->img_comp[i].v > v_max)
+        if (z->img_comp[i].v > v_max)
             v_max = z->img_comp[i].v;
     }
 
@@ -3175,7 +3167,7 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan)
     z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
     z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
 
-    for(i = 0; i < s->img_n; ++i)
+    for (i = 0; i < s->img_n; ++i)
     {
         // number of effective pixels (e.g. for non-interleaved MCU)
         z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
@@ -3193,19 +3185,19 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan)
         z->img_comp[i].raw_coeff = 0;
         z->img_comp[i].linebuf = NULL;
         z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
-        if(z->img_comp[i].raw_data == NULL)
+        if (z->img_comp[i].raw_data == NULL)
             return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory"));
         // align blocks for idct using mmx/sse
-        z->img_comp[i].data = ( stbi_uc* )((( size_t )z->img_comp[i].raw_data + 15) & ~15);
-        if(z->progressive)
+        z->img_comp[i].data = (stbi_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
+        if (z->progressive)
         {
             // w2, h2 are multiples of 8 (see above)
             z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
             z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
             z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
-            if(z->img_comp[i].raw_coeff == NULL)
+            if (z->img_comp[i].raw_coeff == NULL)
                 return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory"));
-            z->img_comp[i].coeff = ( short* )((( size_t )z->img_comp[i].raw_coeff + 15) & ~15);
+            z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
         }
     }
 
@@ -3225,29 +3217,29 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan)
 {
     int m;
     z->jfif = 0;
-    z->app14_color_transform = -1;    // valid values are 0,1,2
-    z->marker = STBI__MARKER_none;    // initialize cached marker to empty
+    z->app14_color_transform = -1; // valid values are 0,1,2
+    z->marker = STBI__MARKER_none; // initialize cached marker to empty
     m = stbi__get_marker(z);
-    if(!stbi__SOI(m))
+    if (!stbi__SOI(m))
         return stbi__err("no SOI", "Corrupt JPEG");
-    if(scan == STBI__SCAN_type)
+    if (scan == STBI__SCAN_type)
         return 1;
     m = stbi__get_marker(z);
-    while(!stbi__SOF(m))
+    while (!stbi__SOF(m))
     {
-        if(!stbi__process_marker(z, m))
+        if (!stbi__process_marker(z, m))
             return 0;
         m = stbi__get_marker(z);
-        while(m == STBI__MARKER_none)
+        while (m == STBI__MARKER_none)
         {
             // some files have extra padding after their blocks, so ok, we'll scan
-            if(stbi__at_eof(z->s))
+            if (stbi__at_eof(z->s))
                 return stbi__err("no SOF", "Corrupt JPEG");
             m = stbi__get_marker(z);
         }
     }
     z->progressive = stbi__SOF_progressive(m);
-    if(!stbi__process_frame_header(z, scan))
+    if (!stbi__process_frame_header(z, scan))
         return 0;
     return 1;
 }
@@ -3256,30 +3248,30 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan)
 static int stbi__decode_jpeg_image(stbi__jpeg* j)
 {
     int m;
-    for(m = 0; m < 4; m++)
+    for (m = 0; m < 4; m++)
     {
         j->img_comp[m].raw_data = NULL;
         j->img_comp[m].raw_coeff = NULL;
     }
     j->restart_interval = 0;
-    if(!stbi__decode_jpeg_header(j, STBI__SCAN_load))
+    if (!stbi__decode_jpeg_header(j, STBI__SCAN_load))
         return 0;
     m = stbi__get_marker(j);
-    while(!stbi__EOI(m))
+    while (!stbi__EOI(m))
     {
-        if(stbi__SOS(m))
+        if (stbi__SOS(m))
         {
-            if(!stbi__process_scan_header(j))
+            if (!stbi__process_scan_header(j))
                 return 0;
-            if(!stbi__parse_entropy_coded_data(j))
+            if (!stbi__parse_entropy_coded_data(j))
                 return 0;
-            if(j->marker == STBI__MARKER_none)
+            if (j->marker == STBI__MARKER_none)
             {
                 // handle 0s at the end of image data from IP Kamera 9060
-                while(!stbi__at_eof(j->s))
+                while (!stbi__at_eof(j->s))
                 {
                     int x = stbi__get8(j->s);
-                    if(x == 255)
+                    if (x == 255)
                     {
                         j->marker = stbi__get8(j->s);
                         break;
@@ -3289,23 +3281,23 @@ static int stbi__decode_jpeg_image(stbi__jpeg* j)
                 // return 0
             }
         }
-        else if(stbi__DNL(m))
+        else if (stbi__DNL(m))
         {
             int Ld = stbi__get16be(j->s);
             stbi__uint32 NL = stbi__get16be(j->s);
-            if(Ld != 4)
+            if (Ld != 4)
                 return stbi__err("bad DNL len", "Corrupt JPEG");
-            if(NL != j->s->img_y)
+            if (NL != j->s->img_y)
                 return stbi__err("bad DNL height", "Corrupt JPEG");
         }
         else
         {
-            if(!stbi__process_marker(j, m))
+            if (!stbi__process_marker(j, m))
                 return 0;
         }
         m = stbi__get_marker(j);
     }
-    if(j->progressive)
+    if (j->progressive)
         stbi__jpeg_finish(j);
     return 1;
 }
@@ -3330,7 +3322,7 @@ static stbi_uc* stbi__resample_row_v_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
     // need to generate two samples vertically for every one in input
     int i;
     STBI_NOTUSED(hs);
-    for(i = 0; i < w; ++i)
+    for (i = 0; i < w; ++i)
         out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2);
     return out;
 }
@@ -3341,7 +3333,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
     int i;
     stbi_uc* input = in_near;
 
-    if(w == 1)
+    if (w == 1)
     {
         // if only one sample, can't do any interpolation
         out[0] = out[1] = input[0];
@@ -3350,7 +3342,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
 
     out[0] = input[0];
     out[1] = stbi__div4(input[0] * 3 + input[1] + 2);
-    for(i = 1; i < w - 1; ++i)
+    for (i = 1; i < w - 1; ++i)
     {
         int n = 3 * input[i] + 2;
         out[i * 2 + 0] = stbi__div4(n + input[i - 1]);
@@ -3371,7 +3363,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
 {
     // need to generate 2x2 samples for every one in input
     int i, t0, t1;
-    if(w == 1)
+    if (w == 1)
     {
         out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
         return out;
@@ -3379,7 +3371,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
 
     t1 = 3 * in_near[0] + in_far[0];
     out[0] = stbi__div4(t1 + 2);
-    for(i = 1; i < w; ++i)
+    for (i = 1; i < w; ++i)
     {
         t0 = t1;
         t1 = 3 * in_near[i] + in_far[i];
@@ -3399,7 +3391,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
     // need to generate 2x2 samples for every one in input
     int i = 0, t0, t1;
 
-    if(w == 1)
+    if (w == 1)
     {
         out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
         return out;
@@ -3409,19 +3401,19 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
     // process groups of 8 pixels for as long as we can.
     // note we can't handle the last pixel in a row in this loop
     // because we need to handle the filter boundary conditions.
-    for(; i < ((w - 1) & ~7); i += 8)
+    for (; i < ((w - 1) & ~7); i += 8)
     {
 #if defined(STBI_SSE2)
         // load and perform the vertical filtering pass
         // this uses 3*x + y = 4*x + (y - x)
         __m128i zero = _mm_setzero_si128();
-        __m128i farb = _mm_loadl_epi64(( __m128i* )(in_far + i));
-        __m128i nearb = _mm_loadl_epi64(( __m128i* )(in_near + i));
+        __m128i farb = _mm_loadl_epi64((__m128i*)(in_far + i));
+        __m128i nearb = _mm_loadl_epi64((__m128i*)(in_near + i));
         __m128i farw = _mm_unpacklo_epi8(farb, zero);
         __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
         __m128i diff = _mm_sub_epi16(farw, nearw);
         __m128i nears = _mm_slli_epi16(nearw, 2);
-        __m128i curr = _mm_add_epi16(nears, diff);    // current row
+        __m128i curr = _mm_add_epi16(nears, diff); // current row
 
         // horizontal filter works the same based on shifted vers of current
         // row. "prev" is current row shifted right by 1 pixel; we need to
@@ -3453,7 +3445,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
 
         // pack and write output
         __m128i outv = _mm_packus_epi16(de0, de1);
-        _mm_storeu_si128(( __m128i* )(out + i * 2), outv);
+        _mm_storeu_si128((__m128i*)(out + i * 2), outv);
 #elif defined(STBI_NEON)
         // load and perform the vertical filtering pass
         // this uses 3*x + y = 4*x + (y - x)
@@ -3461,7 +3453,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
         uint8x8_t nearb = vld1_u8(in_near + i);
         int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
         int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
-        int16x8_t curr = vaddq_s16(nears, diff);    // current row
+        int16x8_t curr = vaddq_s16(nears, diff); // current row
 
         // horizontal filter works the same based on shifted vers of current
         // row. "prev" is current row shifted right by 1 pixel; we need to
@@ -3498,7 +3490,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
     t1 = 3 * in_near[i] + in_far[i];
     out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
 
-    for(++i; i < w; ++i)
+    for (++i; i < w; ++i)
     {
         t0 = t1;
         t1 = 3 * in_near[i] + in_far[i];
@@ -3518,22 +3510,22 @@ static stbi_uc* stbi__resample_row_generic(stbi_uc* out, stbi_uc* in_near, stbi_
     // resample with nearest-neighbor
     int i, j;
     STBI_NOTUSED(in_far);
-    for(i = 0; i < w; ++i)
-        for(j = 0; j < hs; ++j)
+    for (i = 0; i < w; ++i)
+        for (j = 0; j < hs; ++j)
             out[i * hs + j] = in_near[i];
     return out;
 }
 
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define stbi__float2fixed(x) ((( int )(( x )*4096.0f + 0.5f)) << 8)
+#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
 static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc* pcb, const stbi_uc* pcr, int count,
                                    int step)
 {
     int i;
-    for(i = 0; i < count; ++i)
+    for (i = 0; i < count; ++i)
     {
-        int y_fixed = (y[i] << 20) + (1 << 19);    // rounding
+        int y_fixed = (y[i] << 20) + (1 << 19); // rounding
         int r, g, b;
         int cr = pcr[i] - 128;
         int cb = pcb[i] - 128;
@@ -3543,30 +3535,30 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc
         r >>= 20;
         g >>= 20;
         b >>= 20;
-        if(( unsigned )r > 255)
+        if ((unsigned)r > 255)
         {
-            if(r < 0)
+            if (r < 0)
                 r = 0;
             else
                 r = 255;
         }
-        if(( unsigned )g > 255)
+        if ((unsigned)g > 255)
         {
-            if(g < 0)
+            if (g < 0)
                 g = 0;
             else
                 g = 255;
         }
-        if(( unsigned )b > 255)
+        if ((unsigned)b > 255)
         {
-            if(b < 0)
+            if (b < 0)
                 b = 0;
             else
                 b = 255;
         }
-        out[0] = ( stbi_uc )r;
-        out[1] = ( stbi_uc )g;
-        out[2] = ( stbi_uc )b;
+        out[0] = (stbi_uc)r;
+        out[1] = (stbi_uc)g;
+        out[2] = (stbi_uc)b;
         out[3] = 255;
         out += step;
     }
@@ -3582,25 +3574,25 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
     // step == 3 is pretty ugly on the final interleave, and i'm not convinced
     // it's useful in practice (you wouldn't use it for textures, for example).
     // so just accelerate step == 4 case.
-    if(step == 4)
+    if (step == 4)
     {
         // this is a fairly straightforward implementation and not super-optimized.
         __m128i signflip = _mm_set1_epi8(-0x80);
-        __m128i cr_const0 = _mm_set1_epi16(( short )(1.40200f * 4096.0f + 0.5f));
-        __m128i cr_const1 = _mm_set1_epi16(-( short )(0.71414f * 4096.0f + 0.5f));
-        __m128i cb_const0 = _mm_set1_epi16(-( short )(0.34414f * 4096.0f + 0.5f));
-        __m128i cb_const1 = _mm_set1_epi16(( short )(1.77200f * 4096.0f + 0.5f));
-        __m128i y_bias = _mm_set1_epi8(( char )( unsigned char )128);
-        __m128i xw = _mm_set1_epi16(255);    // alpha channel
-
-        for(; i + 7 < count; i += 8)
+        __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
+        __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
+        __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
+        __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
+        __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
+        __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+        for (; i + 7 < count; i += 8)
         {
             // load
-            __m128i y_bytes = _mm_loadl_epi64(( __m128i* )(y + i));
-            __m128i cr_bytes = _mm_loadl_epi64(( __m128i* )(pcr + i));
-            __m128i cb_bytes = _mm_loadl_epi64(( __m128i* )(pcb + i));
-            __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip);    // -128
-            __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip);    // -128
+            __m128i y_bytes = _mm_loadl_epi64((__m128i*)(y + i));
+            __m128i cr_bytes = _mm_loadl_epi64((__m128i*)(pcr + i));
+            __m128i cb_bytes = _mm_loadl_epi64((__m128i*)(pcb + i));
+            __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+            __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
 
             // unpack to short (and left-shift cr, cb by 8)
             __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
@@ -3634,8 +3626,8 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
             __m128i o1 = _mm_unpackhi_epi16(t0, t1);
 
             // store
-            _mm_storeu_si128(( __m128i* )(out + 0), o0);
-            _mm_storeu_si128(( __m128i* )(out + 16), o1);
+            _mm_storeu_si128((__m128i*)(out + 0), o0);
+            _mm_storeu_si128((__m128i*)(out + 16), o1);
             out += 32;
         }
     }
@@ -3643,16 +3635,16 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
 
 #ifdef STBI_NEON
     // in this version, step=3 support would be easy to add. but is there demand?
-    if(step == 4)
+    if (step == 4)
     {
         // this is a fairly straightforward implementation and not super-optimized.
         uint8x8_t signflip = vdup_n_u8(0x80);
-        int16x8_t cr_const0 = vdupq_n_s16(( short )(1.40200f * 4096.0f + 0.5f));
-        int16x8_t cr_const1 = vdupq_n_s16(-( short )(0.71414f * 4096.0f + 0.5f));
-        int16x8_t cb_const0 = vdupq_n_s16(-( short )(0.34414f * 4096.0f + 0.5f));
-        int16x8_t cb_const1 = vdupq_n_s16(( short )(1.77200f * 4096.0f + 0.5f));
+        int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
+        int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
+        int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
+        int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
 
-        for(; i + 7 < count; i += 8)
+        for (; i + 7 < count; i += 8)
         {
             // load
             uint8x8_t y_bytes = vld1_u8(y + i);
@@ -3689,9 +3681,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
     }
 #endif
 
-    for(; i < count; ++i)
+    for (; i < count; ++i)
     {
-        int y_fixed = (y[i] << 20) + (1 << 19);    // rounding
+        int y_fixed = (y[i] << 20) + (1 << 19); // rounding
         int r, g, b;
         int cr = pcr[i] - 128;
         int cb = pcb[i] - 128;
@@ -3701,30 +3693,30 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
         r >>= 20;
         g >>= 20;
         b >>= 20;
-        if(( unsigned )r > 255)
+        if ((unsigned)r > 255)
         {
-            if(r < 0)
+            if (r < 0)
                 r = 0;
             else
                 r = 255;
         }
-        if(( unsigned )g > 255)
+        if ((unsigned)g > 255)
         {
-            if(g < 0)
+            if (g < 0)
                 g = 0;
             else
                 g = 255;
         }
-        if(( unsigned )b > 255)
+        if ((unsigned)b > 255)
         {
-            if(b < 0)
+            if (b < 0)
                 b = 0;
             else
                 b = 255;
         }
-        out[0] = ( stbi_uc )r;
-        out[1] = ( stbi_uc )g;
-        out[2] = ( stbi_uc )b;
+        out[0] = (stbi_uc)r;
+        out[1] = (stbi_uc)g;
+        out[2] = (stbi_uc)b;
         out[3] = 255;
         out += step;
     }
@@ -3739,7 +3731,7 @@ static void stbi__setup_jpeg(stbi__jpeg* j)
     j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
 
 #ifdef STBI_SSE2
-    if(stbi__sse2_available())
+    if (stbi__sse2_available())
     {
         j->idct_block_kernel = stbi__idct_simd;
         j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
@@ -3764,9 +3756,9 @@ typedef struct
 {
     resample_row_func resample;
     stbi_uc *line0, *line1;
-    int hs, vs;    // expansion factor in each axis
-    int w_lores;    // horizontal pixels pre-expansion
-    int ystep;    // how far through vertical expansion we are
+    int hs, vs;  // expansion factor in each axis
+    int w_lores; // horizontal pixels pre-expansion
+    int ystep;   // how far through vertical expansion we are
     int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
@@ -3780,25 +3772,26 @@ static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
 static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp)
 {
     int n, decode_n, is_rgb;
-    z->s->img_n = 0;    // make stbi__cleanup_jpeg safe
+    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 
     // validate req_comp
-    if(req_comp < 0 || req_comp > 4)
+    if (req_comp < 0 || req_comp > 4)
         return stbi__errpuc("bad req_comp", "Internal error");
 
     // load a jpeg image from whichever source, but leave in YCbCr format
-    if(!stbi__decode_jpeg_image(z))
+    if (!stbi__decode_jpeg_image(z))
     {
         stbi__cleanup_jpeg(z);
         return NULL;
     }
 
     // determine actual number of components to generate
-    n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+    n = req_comp ? req_comp : z->s->img_n >= 3 ? 3
+                                               : 1;
 
     is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 
-    if(z->s->img_n == 3 && n < 3 && !is_rgb)
+    if (z->s->img_n == 3 && n < 3 && !is_rgb)
         decode_n = 1;
     else
         decode_n = z->s->img_n;
@@ -3812,14 +3805,14 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
 
         stbi__resample res_comp[4];
 
-        for(k = 0; k < decode_n; ++k)
+        for (k = 0; k < decode_n; ++k)
         {
             stbi__resample* r = &res_comp[k];
 
             // allocate line buffer big enough for upsampling off the edges
             // with upsample factor of 4
-            z->img_comp[k].linebuf = ( stbi_uc* )stbi__malloc(z->s->img_x + 3);
-            if(!z->img_comp[k].linebuf)
+            z->img_comp[k].linebuf = (stbi_uc*)stbi__malloc(z->s->img_x + 3);
+            if (!z->img_comp[k].linebuf)
             {
                 stbi__cleanup_jpeg(z);
                 return stbi__errpuc("outofmem", "Out of memory");
@@ -3832,52 +3825,52 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
             r->ypos = 0;
             r->line0 = r->line1 = z->img_comp[k].data;
 
-            if(r->hs == 1 && r->vs == 1)
+            if (r->hs == 1 && r->vs == 1)
                 r->resample = resample_row_1;
-            else if(r->hs == 1 && r->vs == 2)
+            else if (r->hs == 1 && r->vs == 2)
                 r->resample = stbi__resample_row_v_2;
-            else if(r->hs == 2 && r->vs == 1)
+            else if (r->hs == 2 && r->vs == 1)
                 r->resample = stbi__resample_row_h_2;
-            else if(r->hs == 2 && r->vs == 2)
+            else if (r->hs == 2 && r->vs == 2)
                 r->resample = z->resample_row_hv_2_kernel;
             else
                 r->resample = stbi__resample_row_generic;
         }
 
         // can't error after this so, this is safe
-        output = ( stbi_uc* )stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
-        if(!output)
+        output = (stbi_uc*)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
+        if (!output)
         {
             stbi__cleanup_jpeg(z);
             return stbi__errpuc("outofmem", "Out of memory");
         }
 
         // now go ahead and resample
-        for(j = 0; j < z->s->img_y; ++j)
+        for (j = 0; j < z->s->img_y; ++j)
         {
             stbi_uc* out = output + n * z->s->img_x * j;
-            for(k = 0; k < decode_n; ++k)
+            for (k = 0; k < decode_n; ++k)
             {
                 stbi__resample* r = &res_comp[k];
                 int y_bot = r->ystep >= (r->vs >> 1);
                 coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0,
                                          y_bot ? r->line0 : r->line1, r->w_lores, r->hs);
-                if(++r->ystep >= r->vs)
+                if (++r->ystep >= r->vs)
                 {
                     r->ystep = 0;
                     r->line0 = r->line1;
-                    if(++r->ypos < z->img_comp[k].y)
+                    if (++r->ypos < z->img_comp[k].y)
                         r->line1 += z->img_comp[k].w2;
                 }
             }
-            if(n >= 3)
+            if (n >= 3)
             {
                 stbi_uc* y = coutput[0];
-                if(z->s->img_n == 3)
+                if (z->s->img_n == 3)
                 {
-                    if(is_rgb)
+                    if (is_rgb)
                     {
-                        for(i = 0; i < z->s->img_x; ++i)
+                        for (i = 0; i < z->s->img_x; ++i)
                         {
                             out[0] = y[i];
                             out[1] = coutput[1][i];
@@ -3891,11 +3884,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                         z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
                     }
                 }
-                else if(z->s->img_n == 4)
+                else if (z->s->img_n == 4)
                 {
-                    if(z->app14_color_transform == 0)
-                    {    // CMYK
-                        for(i = 0; i < z->s->img_x; ++i)
+                    if (z->app14_color_transform == 0)
+                    { // CMYK
+                        for (i = 0; i < z->s->img_x; ++i)
                         {
                             stbi_uc m = coutput[3][i];
                             out[0] = stbi__blinn_8x8(coutput[0][i], m);
@@ -3905,10 +3898,10 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                             out += n;
                         }
                     }
-                    else if(z->app14_color_transform == 2)
-                    {    // YCCK
+                    else if (z->app14_color_transform == 2)
+                    { // YCCK
                         z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                        for(i = 0; i < z->s->img_x; ++i)
+                        for (i = 0; i < z->s->img_x; ++i)
                         {
                             stbi_uc m = coutput[3][i];
                             out[0] = stbi__blinn_8x8(255 - out[0], m);
@@ -3918,37 +3911,37 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                         }
                     }
                     else
-                    {    // YCbCr + alpha?  Ignore the fourth channel for now
+                    { // YCbCr + alpha?  Ignore the fourth channel for now
                         z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
                     }
                 }
                 else
-                    for(i = 0; i < z->s->img_x; ++i)
+                    for (i = 0; i < z->s->img_x; ++i)
                     {
                         out[0] = out[1] = out[2] = y[i];
-                        out[3] = 255;    // not used if n==3
+                        out[3] = 255; // not used if n==3
                         out += n;
                     }
             }
             else
             {
-                if(is_rgb)
+                if (is_rgb)
                 {
-                    if(n == 1)
-                        for(i = 0; i < z->s->img_x; ++i)
+                    if (n == 1)
+                        for (i = 0; i < z->s->img_x; ++i)
                             *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
                     else
                     {
-                        for(i = 0; i < z->s->img_x; ++i, out += 2)
+                        for (i = 0; i < z->s->img_x; ++i, out += 2)
                         {
                             out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
                             out[1] = 255;
                         }
                     }
                 }
-                else if(z->s->img_n == 4 && z->app14_color_transform == 0)
+                else if (z->s->img_n == 4 && z->app14_color_transform == 0)
                 {
-                    for(i = 0; i < z->s->img_x; ++i)
+                    for (i = 0; i < z->s->img_x; ++i)
                     {
                         stbi_uc m = coutput[3][i];
                         stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
@@ -3959,9 +3952,9 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                         out += n;
                     }
                 }
-                else if(z->s->img_n == 4 && z->app14_color_transform == 2)
+                else if (z->s->img_n == 4 && z->app14_color_transform == 2)
                 {
-                    for(i = 0; i < z->s->img_x; ++i)
+                    for (i = 0; i < z->s->img_x; ++i)
                     {
                         out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
                         out[1] = 255;
@@ -3971,11 +3964,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                 else
                 {
                     stbi_uc* y = coutput[0];
-                    if(n == 1)
-                        for(i = 0; i < z->s->img_x; ++i)
+                    if (n == 1)
+                        for (i = 0; i < z->s->img_x; ++i)
                             out[i] = y[i];
                     else
-                        for(i = 0; i < z->s->img_x; ++i)
+                        for (i = 0; i < z->s->img_x; ++i)
                             *out++ = y[i], *out++ = 255;
                 }
             }
@@ -3983,8 +3976,8 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
         stbi__cleanup_jpeg(z);
         *out_x = z->s->img_x;
         *out_y = z->s->img_y;
-        if(comp)
-            *comp = z->s->img_n >= 3 ? 3 : 1;    // report original components, not output
+        if (comp)
+            *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
         return output;
     }
 }
@@ -3992,7 +3985,7 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
 static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri)
 {
     unsigned char* result;
-    stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg));
+    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
     STBI_NOTUSED(ri);
     j->s = s;
     stbi__setup_jpeg(j);
@@ -4004,7 +3997,7 @@ static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int re
 static int stbi__jpeg_test(stbi__context* s)
 {
     int r;
-    stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg));
+    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
     j->s = s;
     stbi__setup_jpeg(j);
     r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -4015,16 +4008,16 @@ static int stbi__jpeg_test(stbi__context* s)
 
 static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp)
 {
-    if(!stbi__decode_jpeg_header(j, STBI__SCAN_header))
+    if (!stbi__decode_jpeg_header(j, STBI__SCAN_header))
     {
         stbi__rewind(j->s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = j->s->img_x;
-    if(y)
+    if (y)
         *y = j->s->img_y;
-    if(comp)
+    if (comp)
         *comp = j->s->img_n >= 3 ? 3 : 1;
     return 1;
 }
@@ -4032,7 +4025,7 @@ static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp)
 static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp)
 {
     int result;
-    stbi__jpeg* j = ( stbi__jpeg* )(stbi__malloc(sizeof(stbi__jpeg)));
+    stbi__jpeg* j = (stbi__jpeg*)(stbi__malloc(sizeof(stbi__jpeg)));
     j->s = s;
     result = stbi__jpeg_info_raw(j, x, y, comp);
     STBI_FREE(j);
@@ -4050,7 +4043,7 @@ static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp)
 #ifndef STBI_NO_ZLIB
 
 // fast-way is faster to check than jpeg huffman, but slow way is slower
-#define STBI__ZFAST_BITS 9    // accelerate all cases in default tables
+#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables
 #define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
 
 // zlib-style huffman encoding
@@ -4090,40 +4083,40 @@ static int stbi__zbuild_huffman(stbi__zhuffman* z, const stbi_uc* sizelist, int
     // DEFLATE spec for generating codes
     memset(sizes, 0, sizeof(sizes));
     memset(z->fast, 0, sizeof(z->fast));
-    for(i = 0; i < num; ++i)
+    for (i = 0; i < num; ++i)
         ++sizes[sizelist[i]];
     sizes[0] = 0;
-    for(i = 1; i < 16; ++i)
-        if(sizes[i] > (1 << i))
+    for (i = 1; i < 16; ++i)
+        if (sizes[i] > (1 << i))
             return stbi__err("bad sizes", "Corrupt PNG");
     code = 0;
-    for(i = 1; i < 16; ++i)
+    for (i = 1; i < 16; ++i)
     {
         next_code[i] = code;
-        z->firstcode[i] = ( stbi__uint16 )code;
-        z->firstsymbol[i] = ( stbi__uint16 )k;
+        z->firstcode[i] = (stbi__uint16)code;
+        z->firstsymbol[i] = (stbi__uint16)k;
         code = (code + sizes[i]);
-        if(sizes[i])
-            if(code - 1 >= (1 << i))
+        if (sizes[i])
+            if (code - 1 >= (1 << i))
                 return stbi__err("bad codelengths", "Corrupt PNG");
-        z->maxcode[i] = code << (16 - i);    // preshift for inner loop
+        z->maxcode[i] = code << (16 - i); // preshift for inner loop
         code <<= 1;
         k += sizes[i];
     }
-    z->maxcode[16] = 0x10000;    // sentinel
-    for(i = 0; i < num; ++i)
+    z->maxcode[16] = 0x10000; // sentinel
+    for (i = 0; i < num; ++i)
     {
         int s = sizelist[i];
-        if(s)
+        if (s)
         {
             int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
             stbi__uint16 fastv = (stbi__uint16)((s << 9) | i);
-            z->size[c] = ( stbi_uc )s;
-            z->value[c] = ( stbi__uint16 )i;
-            if(s <= STBI__ZFAST_BITS)
+            z->size[c] = (stbi_uc)s;
+            z->value[c] = (stbi__uint16)i;
+            if (s <= STBI__ZFAST_BITS)
             {
                 int j = stbi__bit_reverse(next_code[s], s);
-                while(j < (1 << STBI__ZFAST_BITS))
+                while (j < (1 << STBI__ZFAST_BITS))
                 {
                     z->fast[j] = fastv;
                     j += (1 << s);
@@ -4157,7 +4150,7 @@ typedef struct
 
 stbi_inline static stbi_uc stbi__zget8(stbi__zbuf* z)
 {
-    if(z->zbuffer >= z->zbuffer_end)
+    if (z->zbuffer >= z->zbuffer_end)
         return 0;
     return *z->zbuffer++;
 }
@@ -4167,15 +4160,15 @@ static void stbi__fill_bits(stbi__zbuf* z)
     do
     {
         STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
-        z->code_buffer |= ( unsigned int )stbi__zget8(z) << z->num_bits;
+        z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits;
         z->num_bits += 8;
-    } while(z->num_bits <= 24);
+    } while (z->num_bits <= 24);
 }
 
 stbi_inline static unsigned int stbi__zreceive(stbi__zbuf* z, int n)
 {
     unsigned int k;
-    if(z->num_bits < n)
+    if (z->num_bits < n)
         stbi__fill_bits(z);
     k = z->code_buffer & ((1 << n) - 1);
     z->code_buffer >>= n;
@@ -4189,11 +4182,11 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z)
     // not resolved by fast table, so compute it the slow way
     // use jpeg approach, which requires MSbits at top
     k = stbi__bit_reverse(a->code_buffer, 16);
-    for(s = STBI__ZFAST_BITS + 1;; ++s)
-        if(k < z->maxcode[s])
+    for (s = STBI__ZFAST_BITS + 1;; ++s)
+        if (k < z->maxcode[s])
             break;
-    if(s == 16)
-        return -1;    // invalid code!
+    if (s == 16)
+        return -1; // invalid code!
     // code size is s, so:
     b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
     STBI_ASSERT(z->size[b] == s);
@@ -4205,10 +4198,10 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z)
 stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z)
 {
     int b, s;
-    if(a->num_bits < 16)
+    if (a->num_bits < 16)
         stbi__fill_bits(a);
     b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-    if(b)
+    if (b)
     {
         s = b >> 9;
         a->code_buffer >>= s;
@@ -4218,20 +4211,20 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z)
     return stbi__zhuffman_decode_slowpath(a, z);
 }
 
-static int stbi__zexpand(stbi__zbuf* z, char* zout, int n)    // need to make room for n bytes
+static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make room for n bytes
 {
     char* q;
     int cur, limit, old_limit;
     z->zout = zout;
-    if(!z->z_expandable)
+    if (!z->z_expandable)
         return stbi__err("output buffer limit", "Corrupt PNG");
-    cur = ( int )(z->zout - z->zout_start);
-    limit = old_limit = ( int )(z->zout_end - z->zout_start);
-    while(cur + n > limit)
+    cur = (int)(z->zout - z->zout_start);
+    limit = old_limit = (int)(z->zout_end - z->zout_start);
+    while (cur + n > limit)
         limit *= 2;
-    q = ( char* )STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+    q = (char*)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
     STBI_NOTUSED(old_limit);
-    if(q == NULL)
+    if (q == NULL)
         return stbi__err("outofmem", "Out of memory");
     z->zout_start = q;
     z->zout = q + cur;
@@ -4239,82 +4232,82 @@ static int stbi__zexpand(stbi__zbuf* z, char* zout, int n)    // need to make ro
     return 1;
 }
 
-static const int stbi__zlength_base[31] = {3,  4,  5,  6,  7,  8,  9,  10,  11,  13,  15,  17,  19,  23, 27, 31,
-                                           35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0,  0};
+static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+                                           35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
 
 static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
                                             3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0};
 
-static const int stbi__zdist_base[32] = {1,    2,    3,    4,    5,    7,     9,     13,    17,  25,   33,
-                                         49,   65,   97,   129,  193,  257,   385,   513,   769, 1025, 1537,
-                                         2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0,   0};
+static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33,
+                                         49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537,
+                                         2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0};
 
-static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2,  3,  3,  4,  4,  5,  5,  6,
+static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
                                           6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
 
 static int stbi__parse_huffman_block(stbi__zbuf* a)
 {
     char* zout = a->zout;
-    for(;;)
+    for (;;)
     {
         int z = stbi__zhuffman_decode(a, &a->z_length);
-        if(z < 256)
+        if (z < 256)
         {
-            if(z < 0)
-                return stbi__err("bad huffman code", "Corrupt PNG");    // error in huffman codes
-            if(zout >= a->zout_end)
+            if (z < 0)
+                return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes
+            if (zout >= a->zout_end)
             {
-                if(!stbi__zexpand(a, zout, 1))
+                if (!stbi__zexpand(a, zout, 1))
                     return 0;
                 zout = a->zout;
             }
-            *zout++ = ( char )z;
+            *zout++ = (char)z;
         }
         else
         {
             stbi_uc* p;
             int len, dist;
-            if(z == 256)
+            if (z == 256)
             {
                 a->zout = zout;
                 return 1;
             }
             z -= 257;
             len = stbi__zlength_base[z];
-            if(stbi__zlength_extra[z])
+            if (stbi__zlength_extra[z])
                 len += stbi__zreceive(a, stbi__zlength_extra[z]);
             z = stbi__zhuffman_decode(a, &a->z_distance);
-            if(z < 0)
+            if (z < 0)
                 return stbi__err("bad huffman code", "Corrupt PNG");
             dist = stbi__zdist_base[z];
-            if(stbi__zdist_extra[z])
+            if (stbi__zdist_extra[z])
                 dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-            if(zout - a->zout_start < dist)
+            if (zout - a->zout_start < dist)
                 return stbi__err("bad dist", "Corrupt PNG");
-            if(zout + len > a->zout_end)
+            if (zout + len > a->zout_end)
             {
-                if(!stbi__zexpand(a, zout, len))
+                if (!stbi__zexpand(a, zout, len))
                     return 0;
                 zout = a->zout;
             }
-            p = ( stbi_uc* )(zout - dist);
-            if(dist == 1)
-            {    // run of one byte; common in images.
+            p = (stbi_uc*)(zout - dist);
+            if (dist == 1)
+            { // run of one byte; common in images.
                 stbi_uc v = *p;
-                if(len)
+                if (len)
                 {
                     do
                         *zout++ = v;
-                    while(--len);
+                    while (--len);
                 }
             }
             else
             {
-                if(len)
+                if (len)
                 {
                     do
                         *zout++ = *p++;
-                    while(--len);
+                    while (--len);
                 }
             }
         }
@@ -4325,7 +4318,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a)
 {
     static const stbi_uc length_dezigzag[19] = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
     stbi__zhuffman z_codelength;
-    stbi_uc lencodes[286 + 32 + 137];    // padding for maximum single op
+    stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op
     stbi_uc codelength_sizes[19];
     int i, n;
 
@@ -4335,50 +4328,50 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a)
     int ntot = hlit + hdist;
 
     memset(codelength_sizes, 0, sizeof(codelength_sizes));
-    for(i = 0; i < hclen; ++i)
+    for (i = 0; i < hclen; ++i)
     {
         int s = stbi__zreceive(a, 3);
-        codelength_sizes[length_dezigzag[i]] = ( stbi_uc )s;
+        codelength_sizes[length_dezigzag[i]] = (stbi_uc)s;
     }
-    if(!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19))
+    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19))
         return 0;
 
     n = 0;
-    while(n < ntot)
+    while (n < ntot)
     {
         int c = stbi__zhuffman_decode(a, &z_codelength);
-        if(c < 0 || c >= 19)
+        if (c < 0 || c >= 19)
             return stbi__err("bad codelengths", "Corrupt PNG");
-        if(c < 16)
-            lencodes[n++] = ( stbi_uc )c;
+        if (c < 16)
+            lencodes[n++] = (stbi_uc)c;
         else
         {
             stbi_uc fill = 0;
-            if(c == 16)
+            if (c == 16)
             {
                 c = stbi__zreceive(a, 2) + 3;
-                if(n == 0)
+                if (n == 0)
                     return stbi__err("bad codelengths", "Corrupt PNG");
                 fill = lencodes[n - 1];
             }
-            else if(c == 17)
+            else if (c == 17)
                 c = stbi__zreceive(a, 3) + 3;
             else
             {
                 STBI_ASSERT(c == 18);
                 c = stbi__zreceive(a, 7) + 11;
             }
-            if(ntot - n < c)
+            if (ntot - n < c)
                 return stbi__err("bad codelengths", "Corrupt PNG");
             memset(lencodes + n, fill, c);
             n += c;
         }
     }
-    if(n != ntot)
+    if (n != ntot)
         return stbi__err("bad codelengths", "Corrupt PNG");
-    if(!stbi__zbuild_huffman(&a->z_length, lencodes, hlit))
+    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit))
         return 0;
-    if(!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist))
+    if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist))
         return 0;
     return 1;
 }
@@ -4387,28 +4380,28 @@ static int stbi__parse_uncompressed_block(stbi__zbuf* a)
 {
     stbi_uc header[4];
     int len, nlen, k;
-    if(a->num_bits & 7)
-        stbi__zreceive(a, a->num_bits & 7);    // discard
+    if (a->num_bits & 7)
+        stbi__zreceive(a, a->num_bits & 7); // discard
     // drain the bit-packed data into header
     k = 0;
-    while(a->num_bits > 0)
+    while (a->num_bits > 0)
     {
-        header[k++] = (stbi_uc)(a->code_buffer & 255);    // suppress MSVC run-time check
+        header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check
         a->code_buffer >>= 8;
         a->num_bits -= 8;
     }
     STBI_ASSERT(a->num_bits == 0);
     // now fill header the normal way
-    while(k < 4)
+    while (k < 4)
         header[k++] = stbi__zget8(a);
     len = header[1] * 256 + header[0];
     nlen = header[3] * 256 + header[2];
-    if(nlen != (len ^ 0xffff))
+    if (nlen != (len ^ 0xffff))
         return stbi__err("zlib corrupt", "Corrupt PNG");
-    if(a->zbuffer + len > a->zbuffer_end)
+    if (a->zbuffer + len > a->zbuffer_end)
         return stbi__err("read past buffer", "Corrupt PNG");
-    if(a->zout + len > a->zout_end)
-        if(!stbi__zexpand(a, a->zout, len))
+    if (a->zout + len > a->zout_end)
+        if (!stbi__zexpand(a, a->zout, len))
             return 0;
     memcpy(a->zout, a->zbuffer, len);
     a->zbuffer += len;
@@ -4422,12 +4415,12 @@ static int stbi__parse_zlib_header(stbi__zbuf* a)
     int cm = cmf & 15;
     /* int cinfo = cmf >> 4; */
     int flg = stbi__zget8(a);
-    if((cmf * 256 + flg) % 31 != 0)
-        return stbi__err("bad zlib header", "Corrupt PNG");    // zlib spec
-    if(flg & 32)
-        return stbi__err("no preset dict", "Corrupt PNG");    // preset dictionary not allowed in png
-    if(cm != 8)
-        return stbi__err("bad compression", "Corrupt PNG");    // DEFLATE required for png
+    if ((cmf * 256 + flg) % 31 != 0)
+        return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
+    if (flg & 32)
+        return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png
+    if (cm != 8)
+        return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png
     // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
     return 1;
 }
@@ -4459,8 +4452,8 @@ Init algorithm:
 static int stbi__parse_zlib(stbi__zbuf* a, int parse_header)
 {
     int final, type;
-    if(parse_header)
-        if(!stbi__parse_zlib_header(a))
+    if (parse_header)
+        if (!stbi__parse_zlib_header(a))
             return 0;
     a->num_bits = 0;
     a->code_buffer = 0;
@@ -4468,34 +4461,34 @@ static int stbi__parse_zlib(stbi__zbuf* a, int parse_header)
     {
         final = stbi__zreceive(a, 1);
         type = stbi__zreceive(a, 2);
-        if(type == 0)
+        if (type == 0)
         {
-            if(!stbi__parse_uncompressed_block(a))
+            if (!stbi__parse_uncompressed_block(a))
                 return 0;
         }
-        else if(type == 3)
+        else if (type == 3)
         {
             return 0;
         }
         else
         {
-            if(type == 1)
+            if (type == 1)
             {
                 // use fixed code lengths
-                if(!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288))
+                if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288))
                     return 0;
-                if(!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
+                if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
                     return 0;
             }
             else
             {
-                if(!stbi__compute_huffman_codes(a))
+                if (!stbi__compute_huffman_codes(a))
                     return 0;
             }
-            if(!stbi__parse_huffman_block(a))
+            if (!stbi__parse_huffman_block(a))
                 return 0;
         }
-    } while(!final);
+    } while (!final);
     return 1;
 }
 
@@ -4512,15 +4505,15 @@ static int stbi__do_zlib(stbi__zbuf* a, char* obuf, int olen, int exp, int parse
 extern char* stbi_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen)
 {
     stbi__zbuf a;
-    char* p = ( char* )stbi__malloc(initial_size);
-    if(p == NULL)
+    char* p = (char*)stbi__malloc(initial_size);
+    if (p == NULL)
         return NULL;
-    a.zbuffer = ( stbi_uc* )buffer;
-    a.zbuffer_end = ( stbi_uc* )buffer + len;
-    if(stbi__do_zlib(&a, p, initial_size, 1, 1))
+    a.zbuffer = (stbi_uc*)buffer;
+    a.zbuffer_end = (stbi_uc*)buffer + len;
+    if (stbi__do_zlib(&a, p, initial_size, 1, 1))
     {
-        if(outlen)
-            *outlen = ( int )(a.zout - a.zout_start);
+        if (outlen)
+            *outlen = (int)(a.zout - a.zout_start);
         return a.zout_start;
     }
     else
@@ -4539,15 +4532,15 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in
                                                           int parse_header)
 {
     stbi__zbuf a;
-    char* p = ( char* )stbi__malloc(initial_size);
-    if(p == NULL)
+    char* p = (char*)stbi__malloc(initial_size);
+    if (p == NULL)
         return NULL;
-    a.zbuffer = ( stbi_uc* )buffer;
-    a.zbuffer_end = ( stbi_uc* )buffer + len;
-    if(stbi__do_zlib(&a, p, initial_size, 1, parse_header))
+    a.zbuffer = (stbi_uc*)buffer;
+    a.zbuffer_end = (stbi_uc*)buffer + len;
+    if (stbi__do_zlib(&a, p, initial_size, 1, parse_header))
     {
-        if(outlen)
-            *outlen = ( int )(a.zout - a.zout_start);
+        if (outlen)
+            *outlen = (int)(a.zout - a.zout_start);
         return a.zout_start;
     }
     else
@@ -4560,10 +4553,10 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in
 extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen)
 {
     stbi__zbuf a;
-    a.zbuffer = ( stbi_uc* )ibuffer;
-    a.zbuffer_end = ( stbi_uc* )ibuffer + ilen;
-    if(stbi__do_zlib(&a, obuffer, olen, 0, 1))
-        return ( int )(a.zout - a.zout_start);
+    a.zbuffer = (stbi_uc*)ibuffer;
+    a.zbuffer_end = (stbi_uc*)ibuffer + ilen;
+    if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
+        return (int)(a.zout - a.zout_start);
     else
         return -1;
 }
@@ -4571,15 +4564,15 @@ extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer,
 extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen)
 {
     stbi__zbuf a;
-    char* p = ( char* )stbi__malloc(16384);
-    if(p == NULL)
+    char* p = (char*)stbi__malloc(16384);
+    if (p == NULL)
         return NULL;
-    a.zbuffer = ( stbi_uc* )buffer;
-    a.zbuffer_end = ( stbi_uc* )buffer + len;
-    if(stbi__do_zlib(&a, p, 16384, 1, 0))
+    a.zbuffer = (stbi_uc*)buffer;
+    a.zbuffer_end = (stbi_uc*)buffer + len;
+    if (stbi__do_zlib(&a, p, 16384, 1, 0))
     {
-        if(outlen)
-            *outlen = ( int )(a.zout - a.zout_start);
+        if (outlen)
+            *outlen = (int)(a.zout - a.zout_start);
         return a.zout_start;
     }
     else
@@ -4592,10 +4585,10 @@ extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int*
 extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen)
 {
     stbi__zbuf a;
-    a.zbuffer = ( stbi_uc* )ibuffer;
-    a.zbuffer_end = ( stbi_uc* )ibuffer + ilen;
-    if(stbi__do_zlib(&a, obuffer, olen, 0, 0))
-        return ( int )(a.zout - a.zout_start);
+    a.zbuffer = (stbi_uc*)ibuffer;
+    a.zbuffer_end = (stbi_uc*)ibuffer + ilen;
+    if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
+        return (int)(a.zout - a.zout_start);
     else
         return -1;
 }
@@ -4630,8 +4623,8 @@ static int stbi__check_png_header(stbi__context* s)
 {
     static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
     int i;
-    for(i = 0; i < 8; ++i)
-        if(stbi__get8(s) != png_sig[i])
+    for (i = 0; i < 8; ++i)
+        if (stbi__get8(s) != png_sig[i])
             return stbi__err("bad png sig", "Not a PNG");
     return 1;
 }
@@ -4663,9 +4656,9 @@ static int stbi__paeth(int a, int b, int c)
     int pa = abs(p - a);
     int pb = abs(p - b);
     int pc = abs(p - c);
-    if(pa <= pb && pa <= pc)
+    if (pa <= pb && pa <= pc)
         return a;
-    if(pb <= pc)
+    if (pb <= pc)
         return b;
     return c;
 }
@@ -4681,18 +4674,18 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
     stbi__uint32 i, j, stride = x * out_n * bytes;
     stbi__uint32 img_len, img_width_bytes;
     int k;
-    int img_n = s->img_n;    // copy it into a local for later
+    int img_n = s->img_n; // copy it into a local for later
 
     int output_bytes = out_n * bytes;
     int filter_bytes = img_n * bytes;
     int width = x;
 
     STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
-    a->out = ( stbi_uc* )stbi__malloc_mad3(x, y, output_bytes, 0);    // extra bytes to write off the end into
-    if(!a->out)
+    a->out = (stbi_uc*)stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+    if (!a->out)
         return stbi__err("outofmem", "Out of memory");
 
-    if(!stbi__mad3sizes_valid(img_n, x, depth, 7))
+    if (!stbi__mad3sizes_valid(img_n, x, depth, 7))
         return stbi__err("too large", "Corrupt PNG");
     img_width_bytes = (((img_n * x * depth) + 7) >> 3);
     img_len = (img_width_bytes + 1) * y;
@@ -4700,75 +4693,74 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
     // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
     // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
     // so just check for raw_len < img_len always.
-    if(raw_len < img_len)
+    if (raw_len < img_len)
         return stbi__err("not enough pixels", "Corrupt PNG");
 
-    for(j = 0; j < y; ++j)
+    for (j = 0; j < y; ++j)
     {
         stbi_uc* cur = a->out + stride * j;
         stbi_uc* prior;
         int filter = *raw++;
 
-        if(filter > 4)
+        if (filter > 4)
             return stbi__err("invalid filter", "Corrupt PNG");
 
-        if(depth < 8)
+        if (depth < 8)
         {
             STBI_ASSERT(img_width_bytes <= x);
-            cur += x * out_n -
-                   img_width_bytes;    // store output to the rightmost img_len bytes, so we can decode in place
+            cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
             filter_bytes = 1;
             width = img_width_bytes;
         }
-        prior = cur - stride;    // bugfix: need to compute this after 'cur +=' computation above
+        prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
 
         // if first row, use special filter that doesn't sample previous row
-        if(j == 0)
+        if (j == 0)
             filter = first_row_filter[filter];
 
         // handle first byte explicitly
-        for(k = 0; k < filter_bytes; ++k)
+        for (k = 0; k < filter_bytes; ++k)
         {
-            switch(filter)
+            switch (filter)
             {
-                case STBI__F_none:
-                    cur[k] = raw[k];
-                    break;
-                case STBI__F_sub:
-                    cur[k] = raw[k];
-                    break;
-                case STBI__F_up:
-                    cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-                    break;
-                case STBI__F_avg:
-                    cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
-                    break;
-                case STBI__F_paeth:
-                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
-                    break;
-                case STBI__F_avg_first:
-                    cur[k] = raw[k];
-                    break;
-                case STBI__F_paeth_first:
-                    cur[k] = raw[k];
-                    break;
+            case STBI__F_none:
+                cur[k] = raw[k];
+                break;
+            case STBI__F_sub:
+                cur[k] = raw[k];
+                break;
+            case STBI__F_up:
+                cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+                break;
+            case STBI__F_avg:
+                cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
+                break;
+            case STBI__F_paeth:
+                cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
+                break;
+            case STBI__F_avg_first:
+                cur[k] = raw[k];
+                break;
+            case STBI__F_paeth_first:
+                cur[k] = raw[k];
+                break;
             }
         }
 
-        if(depth == 8)
+        if (depth == 8)
         {
-            if(img_n != out_n)
-                cur[img_n] = 255;    // first pixel
+            if (img_n != out_n)
+                cur[img_n] = 255; // first pixel
             raw += img_n;
             cur += out_n;
             prior += out_n;
         }
-        else if(depth == 16)
+        else if (depth == 16)
         {
-            if(img_n != out_n)
+            if (img_n != out_n)
             {
-                cur[filter_bytes] = 255;    // first pixel top byte
-                cur[filter_bytes + 1] = 255;    // first pixel bottom byte
+                cur[filter_bytes] = 255;     // first pixel top byte
+                cur[filter_bytes + 1] = 255; // first pixel bottom byte
             }
             raw += filter_bytes;
             cur += output_bytes;
@@ -4782,49 +4774,48 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
         }
 
         // this is a little gross, so that we don't switch per-pixel or per-component
-        if(depth < 8 || img_n == out_n)
+        if (depth < 8 || img_n == out_n)
         {
             int nk = (width - 1) * filter_bytes;
 #define STBI__CASE(f) \
     case f:           \
-        for(k = 0; k < nk; ++k)
-            switch(filter)
+        for (k = 0; k < nk; ++k)
+            switch (filter)
             {
-                // "none" filter turns into a memcpy here; make that explicit.
-                case STBI__F_none:
-                    memcpy(cur, raw, nk);
-                    break;
-                    STBI__CASE(STBI__F_sub)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
-                    }
-                    break;
-                    STBI__CASE(STBI__F_up)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-                    }
-                    break;
-                    STBI__CASE(STBI__F_avg)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1));
-                    }
-                    break;
-                    STBI__CASE(STBI__F_paeth)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] +
-                                                stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes]));
-                    }
-                    break;
-                    STBI__CASE(STBI__F_avg_first)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
-                    }
-                    break;
-                    STBI__CASE(STBI__F_paeth_first)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0));
-                    }
-                    break;
+            // "none" filter turns into a memcpy here; make that explicit.
+            case STBI__F_none:
+                memcpy(cur, raw, nk);
+                break;
+                STBI__CASE(STBI__F_sub)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
+                }
+                break;
+                STBI__CASE(STBI__F_up)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+                }
+                break;
+                STBI__CASE(STBI__F_avg)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1));
+                }
+                break;
+                STBI__CASE(STBI__F_paeth)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes]));
+                }
+                break;
+                STBI__CASE(STBI__F_avg_first)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
+                }
+                break;
+                STBI__CASE(STBI__F_paeth_first)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0));
+                }
+                break;
             }
 #undef STBI__CASE
             raw += nk;
@@ -4832,12 +4823,12 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
         else
         {
             STBI_ASSERT(img_n + 1 == out_n);
-#define STBI__CASE(f)                                                                                      \
-    case f:                                                                                                \
-        for(i = x - 1; i >= 1;                                                                             \
-            --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \
-            for(k = 0; k < filter_bytes; ++k)
-            switch(filter)
+#define STBI__CASE(f)                                                                                       \
+    case f:                                                                                                 \
+        for (i = x - 1; i >= 1;                                                                             \
+             --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \
+            for (k = 0; k < filter_bytes; ++k)
+            switch (filter)
             {
                 STBI__CASE(STBI__F_none)
                 {
@@ -4861,8 +4852,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
                 break;
                 STBI__CASE(STBI__F_paeth)
                 {
-                    cur[k] =
-                        STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes]));
+                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes]));
                 }
                 break;
                 STBI__CASE(STBI__F_avg_first)
@@ -4880,10 +4870,10 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
 
             // the loop above sets the high byte of the pixels' alpha, but for
             // 16 bit png files we also need the low byte set. we'll do that here.
-            if(depth == 16)
+            if (depth == 16)
             {
-                cur = a->out + stride * j;    // start at the beginning of the row again
-                for(i = 0; i < x; ++i, cur += output_bytes)
+                cur = a->out + stride * j; // start at the beginning of the row again
+                for (i = 0; i < x; ++i, cur += output_bytes)
                 {
                     cur[filter_bytes + 1] = 255;
                 }
@@ -4894,17 +4884,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
     // we make a separate pass to expand bits to pixels; for performance,
     // this could run two scanlines behind the above code, so it won't
     // intefere with filtering but will still be in the cache.
-    if(depth < 8)
+    if (depth < 8)
     {
-        for(j = 0; j < y; ++j)
+        for (j = 0; j < y; ++j)
         {
             stbi_uc* cur = a->out + stride * j;
             stbi_uc* in = a->out + stride * j + x * out_n - img_width_bytes;
             // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for
             // 1/2/4-bit png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data
             // that will be skipped in the later loop
-            stbi_uc scale =
-                (color == 0) ? stbi__depth_scale_table[depth] : 1;    // scale grayscale values to 0..255 range
+            stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
 
             // note that the final byte might overshoot and write more data than desired.
             // we can allocate enough data that this never writes out of memory, but it
@@ -4912,35 +4901,35 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
             // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
             // so we need to explicitly clamp the final ones
 
-            if(depth == 4)
+            if (depth == 4)
             {
-                for(k = x * img_n; k >= 2; k -= 2, ++in)
+                for (k = x * img_n; k >= 2; k -= 2, ++in)
                 {
                     *cur++ = scale * ((*in >> 4));
                     *cur++ = scale * ((*in) & 0x0f);
                 }
-                if(k > 0)
+                if (k > 0)
                     *cur++ = scale * ((*in >> 4));
             }
-            else if(depth == 2)
+            else if (depth == 2)
             {
-                for(k = x * img_n; k >= 4; k -= 4, ++in)
+                for (k = x * img_n; k >= 4; k -= 4, ++in)
                 {
                     *cur++ = scale * ((*in >> 6));
                     *cur++ = scale * ((*in >> 4) & 0x03);
                     *cur++ = scale * ((*in >> 2) & 0x03);
                     *cur++ = scale * ((*in) & 0x03);
                 }
-                if(k > 0)
+                if (k > 0)
                     *cur++ = scale * ((*in >> 6));
-                if(k > 1)
+                if (k > 1)
                     *cur++ = scale * ((*in >> 4) & 0x03);
-                if(k > 2)
+                if (k > 2)
                     *cur++ = scale * ((*in >> 2) & 0x03);
             }
-            else if(depth == 1)
+            else if (depth == 1)
             {
-                for(k = x * img_n; k >= 8; k -= 8, ++in)
+                for (k = x * img_n; k >= 8; k -= 8, ++in)
                 {
                     *cur++ = scale * ((*in >> 7));
                     *cur++ = scale * ((*in >> 6) & 0x01);
@@ -4951,29 +4940,29 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
                     *cur++ = scale * ((*in >> 1) & 0x01);
                     *cur++ = scale * ((*in) & 0x01);
                 }
-                if(k > 0)
+                if (k > 0)
                     *cur++ = scale * ((*in >> 7));
-                if(k > 1)
+                if (k > 1)
                     *cur++ = scale * ((*in >> 6) & 0x01);
-                if(k > 2)
+                if (k > 2)
                     *cur++ = scale * ((*in >> 5) & 0x01);
-                if(k > 3)
+                if (k > 3)
                     *cur++ = scale * ((*in >> 4) & 0x01);
-                if(k > 4)
+                if (k > 4)
                     *cur++ = scale * ((*in >> 3) & 0x01);
-                if(k > 5)
+                if (k > 5)
                     *cur++ = scale * ((*in >> 2) & 0x01);
-                if(k > 6)
+                if (k > 6)
                     *cur++ = scale * ((*in >> 1) & 0x01);
             }
-            if(img_n != out_n)
+            if (img_n != out_n)
             {
                 int q;
                 // insert alpha = 255
                 cur = a->out + stride * j;
-                if(img_n == 1)
+                if (img_n == 1)
                 {
-                    for(q = x - 1; q >= 0; --q)
+                    for (q = x - 1; q >= 0; --q)
                     {
                         cur[q * 2 + 1] = 255;
                         cur[q * 2 + 0] = cur[q];
@@ -4982,7 +4971,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
                 else
                 {
                     STBI_ASSERT(img_n == 3);
-                    for(q = x - 1; q >= 0; --q)
+                    for (q = x - 1; q >= 0; --q)
                     {
                         cur[q * 4 + 3] = 255;
                         cur[q * 4 + 2] = cur[q * 3 + 2];
@@ -4993,16 +4982,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
             }
         }
     }
-    else if(depth == 16)
+    else if (depth == 16)
     {
         // force the image data from big-endian to platform-native.
         // this is done in a separate pass due to the decoding relying
         // on the data being untouched, but could probably be done
         // per-line during decode if care is taken.
         stbi_uc* cur = a->out;
-        stbi__uint16* cur16 = ( stbi__uint16* )cur;
+        stbi__uint16* cur16 = (stbi__uint16*)cur;
 
-        for(i = 0; i < x * y * out_n; ++i, cur16++, cur += 2)
+        for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2)
         {
             *cur16 = (cur[0] << 8) | cur[1];
         }
@@ -5018,12 +5007,12 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3
     int out_bytes = out_n * bytes;
     stbi_uc* final;
     int p;
-    if(!interlaced)
+    if (!interlaced)
         return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
     // de-interlacing
-    final = ( stbi_uc* )stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
-    for(p = 0; p < 7; ++p)
+    final = (stbi_uc*)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+    for (p = 0; p < 7; ++p)
     {
         int xorig[] = {0, 4, 0, 2, 0, 1, 0};
         int yorig[] = {0, 0, 4, 0, 2, 0, 1};
@@ -5033,17 +5022,17 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3
         // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
         x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
         y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
-        if(x && y)
+        if (x && y)
         {
             stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
-            if(!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color))
+            if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color))
             {
                 STBI_FREE(final);
                 return 0;
             }
-            for(j = 0; j < y; ++j)
+            for (j = 0; j < y; ++j)
             {
-                for(i = 0; i < x; ++i)
+                for (i = 0; i < x; ++i)
                 {
                     int out_y = j * yspc[p] + yorig[p];
                     int out_x = i * xspc[p] + xorig[p];
@@ -5071,9 +5060,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n)
     // already got 255 as the alpha value in the output
     STBI_ASSERT(out_n == 2 || out_n == 4);
 
-    if(out_n == 2)
+    if (out_n == 2)
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             p[1] = (p[0] == tc[0] ? 0 : 255);
             p += 2;
@@ -5081,9 +5070,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n)
     }
     else
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
-            if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
                 p[3] = 0;
             p += 4;
         }
@@ -5095,15 +5084,15 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou
 {
     stbi__context* s = z->s;
     stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-    stbi__uint16* p = ( stbi__uint16* )z->out;
+    stbi__uint16* p = (stbi__uint16*)z->out;
 
     // compute color-based transparency, assuming we've
     // already got 65535 as the alpha value in the output
     STBI_ASSERT(out_n == 2 || out_n == 4);
 
-    if(out_n == 2)
+    if (out_n == 2)
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             p[1] = (p[0] == tc[0] ? 0 : 65535);
             p += 2;
@@ -5111,9 +5100,9 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou
     }
     else
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
-            if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
                 p[3] = 0;
             p += 4;
         }
@@ -5126,16 +5115,16 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int
     stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
     stbi_uc *p, *temp_out, *orig = a->out;
 
-    p = ( stbi_uc* )stbi__malloc_mad2(pixel_count, pal_img_n, 0);
-    if(p == NULL)
+    p = (stbi_uc*)stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+    if (p == NULL)
         return stbi__err("outofmem", "Out of memory");
 
     // between here and free(out) below, exitting would leak
     temp_out = p;
 
-    if(pal_img_n == 3)
+    if (pal_img_n == 3)
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             int n = orig[i] * 4;
             p[0] = palette[n];
@@ -5146,7 +5135,7 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int
     }
     else
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             int n = orig[i] * 4;
             p[0] = palette[n];
@@ -5183,9 +5172,9 @@ static void stbi__de_iphone(stbi__png* z)
     stbi__uint32 i, pixel_count = s->img_x * s->img_y;
     stbi_uc* p = z->out;
 
-    if(s->img_out_n == 3)
-    {    // convert bgr to rgb
-        for(i = 0; i < pixel_count; ++i)
+    if (s->img_out_n == 3)
+    { // convert bgr to rgb
+        for (i = 0; i < pixel_count; ++i)
         {
             stbi_uc t = p[0];
             p[0] = p[2];
@@ -5196,14 +5185,14 @@ static void stbi__de_iphone(stbi__png* z)
     else
     {
         STBI_ASSERT(s->img_out_n == 4);
-        if(stbi__unpremultiply_on_load)
+        if (stbi__unpremultiply_on_load)
         {
             // convert bgr to rgb and unpremultiply
-            for(i = 0; i < pixel_count; ++i)
+            for (i = 0; i < pixel_count; ++i)
             {
                 stbi_uc a = p[3];
                 stbi_uc t = p[0];
-                if(a)
+                if (a)
                 {
                     stbi_uc half = a / 2;
                     p[0] = (p[2] * 255 + half) / a;
@@ -5221,7 +5210,7 @@ static void stbi__de_iphone(stbi__png* z)
         else
         {
             // convert bgr to rgb
-            for(i = 0; i < pixel_count; ++i)
+            for (i = 0; i < pixel_count; ++i)
             {
                 stbi_uc t = p[0];
                 p[0] = p[2];
@@ -5233,7 +5222,7 @@ static void stbi__de_iphone(stbi__png* z)
 }
 
 #define STBI__PNG_TYPE(a, b, c, d) \
-    ((( unsigned )(a) << 24) + (( unsigned )(b) << 16) + (( unsigned )(c) << 8) + ( unsigned )(d))
+    (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + (unsigned)(d))
 
 static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp)
 {
@@ -5248,250 +5237,249 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp)
     z->idata = NULL;
     z->out = NULL;
 
-    if(!stbi__check_png_header(s))
+    if (!stbi__check_png_header(s))
         return 0;
 
-    if(scan == STBI__SCAN_type)
+    if (scan == STBI__SCAN_type)
         return 1;
 
-    for(;;)
+    for (;;)
     {
         stbi__pngchunk c = stbi__get_chunk_header(s);
-        switch(c.type)
+        switch (c.type)
         {
-            case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
-                is_iphone = 1;
-                stbi__skip(s, c.length);
-                break;
-            case STBI__PNG_TYPE('I', 'H', 'D', 'R'):
+        case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
+            is_iphone = 1;
+            stbi__skip(s, c.length);
+            break;
+        case STBI__PNG_TYPE('I', 'H', 'D', 'R'):
+        {
+            int comp, filter;
+            if (!first)
+                return stbi__err("multiple IHDR", "Corrupt PNG");
+            first = 0;
+            if (c.length != 13)
+                return stbi__err("bad IHDR len", "Corrupt PNG");
+            s->img_x = stbi__get32be(s);
+            if (s->img_x > (1 << 24))
+                return stbi__err("too large", "Very large image (corrupt?)");
+            s->img_y = stbi__get32be(s);
+            if (s->img_y > (1 << 24))
+                return stbi__err("too large", "Very large image (corrupt?)");
+            z->depth = stbi__get8(s);
+            if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)
+                return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);
+            if (color > 6)
+                return stbi__err("bad ctype", "Corrupt PNG");
+            if (color == 3 && z->depth == 16)
+                return stbi__err("bad ctype", "Corrupt PNG");
+            if (color == 3)
+                pal_img_n = 3;
+            else if (color & 1)
+                return stbi__err("bad ctype", "Corrupt PNG");
+            comp = stbi__get8(s);
+            if (comp)
+                return stbi__err("bad comp method", "Corrupt PNG");
+            filter = stbi__get8(s);
+            if (filter)
+                return stbi__err("bad filter method", "Corrupt PNG");
+            interlace = stbi__get8(s);
+            if (interlace > 1)
+                return stbi__err("bad interlace method", "Corrupt PNG");
+            if (!s->img_x || !s->img_y)
+                return stbi__err("0-pixel image", "Corrupt PNG");
+            if (!pal_img_n)
             {
-                int comp, filter;
-                if(!first)
-                    return stbi__err("multiple IHDR", "Corrupt PNG");
-                first = 0;
-                if(c.length != 13)
-                    return stbi__err("bad IHDR len", "Corrupt PNG");
-                s->img_x = stbi__get32be(s);
-                if(s->img_x > (1 << 24))
-                    return stbi__err("too large", "Very large image (corrupt?)");
-                s->img_y = stbi__get32be(s);
-                if(s->img_y > (1 << 24))
-                    return stbi__err("too large", "Very large image (corrupt?)");
-                z->depth = stbi__get8(s);
-                if(z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)
-                    return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only");
-                color = stbi__get8(s);
-                if(color > 6)
-                    return stbi__err("bad ctype", "Corrupt PNG");
-                if(color == 3 && z->depth == 16)
-                    return stbi__err("bad ctype", "Corrupt PNG");
-                if(color == 3)
-                    pal_img_n = 3;
-                else if(color & 1)
-                    return stbi__err("bad ctype", "Corrupt PNG");
-                comp = stbi__get8(s);
-                if(comp)
-                    return stbi__err("bad comp method", "Corrupt PNG");
-                filter = stbi__get8(s);
-                if(filter)
-                    return stbi__err("bad filter method", "Corrupt PNG");
-                interlace = stbi__get8(s);
-                if(interlace > 1)
-                    return stbi__err("bad interlace method", "Corrupt PNG");
-                if(!s->img_x || !s->img_y)
-                    return stbi__err("0-pixel image", "Corrupt PNG");
-                if(!pal_img_n)
-                {
-                    s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
-                    if((1 << 30) / s->img_x / s->img_n < s->img_y)
-                        return stbi__err("too large", "Image too large to decode");
-                    if(scan == STBI__SCAN_header)
-                        return 1;
-                }
-                else
-                {
-                    // if paletted, then pal_n is our final components, and
-                    // img_n is # components to decompress/filter.
-                    s->img_n = 1;
-                    if((1 << 30) / s->img_x / 4 < s->img_y)
-                        return stbi__err("too large", "Corrupt PNG");
-                    // if SCAN_header, have to scan to see if we have a tRNS
-                }
-                break;
+                s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+                if ((1 << 30) / s->img_x / s->img_n < s->img_y)
+                    return stbi__err("too large", "Image too large to decode");
+                if (scan == STBI__SCAN_header)
+                    return 1;
+            }
+            else
+            {
+                // if paletted, then pal_n is our final components, and
+                // img_n is # components to decompress/filter.
+                s->img_n = 1;
+                if ((1 << 30) / s->img_x / 4 < s->img_y)
+                    return stbi__err("too large", "Corrupt PNG");
+                // if SCAN_header, have to scan to see if we have a tRNS
             }
+            break;
+        }
 
-            case STBI__PNG_TYPE('P', 'L', 'T', 'E'):
+        case STBI__PNG_TYPE('P', 'L', 'T', 'E'):
+        {
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256 * 3)
+                return stbi__err("invalid PLTE", "Corrupt PNG");
+            pal_len = c.length / 3;
+            if (pal_len * 3 != c.length)
+                return stbi__err("invalid PLTE", "Corrupt PNG");
+            for (i = 0; i < pal_len; ++i)
+            {
+                palette[i * 4 + 0] = stbi__get8(s);
+                palette[i * 4 + 1] = stbi__get8(s);
+                palette[i * 4 + 2] = stbi__get8(s);
+                palette[i * 4 + 3] = 255;
+            }
+            break;
+        }
+
+        case STBI__PNG_TYPE('t', 'R', 'N', 'S'):
+        {
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata)
+                return stbi__err("tRNS after IDAT", "Corrupt PNG");
+            if (pal_img_n)
             {
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(c.length > 256 * 3)
-                    return stbi__err("invalid PLTE", "Corrupt PNG");
-                pal_len = c.length / 3;
-                if(pal_len * 3 != c.length)
-                    return stbi__err("invalid PLTE", "Corrupt PNG");
-                for(i = 0; i < pal_len; ++i)
+                if (scan == STBI__SCAN_header)
                 {
-                    palette[i * 4 + 0] = stbi__get8(s);
-                    palette[i * 4 + 1] = stbi__get8(s);
-                    palette[i * 4 + 2] = stbi__get8(s);
-                    palette[i * 4 + 3] = 255;
+                    s->img_n = 4;
+                    return 1;
                 }
-                break;
+                if (pal_len == 0)
+                    return stbi__err("tRNS before PLTE", "Corrupt PNG");
+                if (c.length > pal_len)
+                    return stbi__err("bad tRNS len", "Corrupt PNG");
+                pal_img_n = 4;
+                for (i = 0; i < c.length; ++i)
+                    palette[i * 4 + 3] = stbi__get8(s);
             }
-
-            case STBI__PNG_TYPE('t', 'R', 'N', 'S'):
+            else
             {
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(z->idata)
-                    return stbi__err("tRNS after IDAT", "Corrupt PNG");
-                if(pal_img_n)
+                if (!(s->img_n & 1))
+                    return stbi__err("tRNS with alpha", "Corrupt PNG");
+                if (c.length != (stbi__uint32)s->img_n * 2)
+                    return stbi__err("bad tRNS len", "Corrupt PNG");
+                has_trans = 1;
+                if (z->depth == 16)
                 {
-                    if(scan == STBI__SCAN_header)
-                    {
-                        s->img_n = 4;
-                        return 1;
-                    }
-                    if(pal_len == 0)
-                        return stbi__err("tRNS before PLTE", "Corrupt PNG");
-                    if(c.length > pal_len)
-                        return stbi__err("bad tRNS len", "Corrupt PNG");
-                    pal_img_n = 4;
-                    for(i = 0; i < c.length; ++i)
-                        palette[i * 4 + 3] = stbi__get8(s);
+                    for (k = 0; k < s->img_n; ++k)
+                        tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
                 }
                 else
                 {
-                    if(!(s->img_n & 1))
-                        return stbi__err("tRNS with alpha", "Corrupt PNG");
-                    if(c.length != ( stbi__uint32 )s->img_n * 2)
-                        return stbi__err("bad tRNS len", "Corrupt PNG");
-                    has_trans = 1;
-                    if(z->depth == 16)
-                    {
-                        for(k = 0; k < s->img_n; ++k)
-                            tc16[k] = ( stbi__uint16 )stbi__get16be(s);    // copy the values as-is
-                    }
-                    else
-                    {
-                        for(k = 0; k < s->img_n; ++k)
-                            tc[k] = (stbi_uc)(stbi__get16be(s) & 255) *
-                                    stbi__depth_scale_table[z->depth];    // non 8-bit images will be larger
-                    }
+                    for (k = 0; k < s->img_n; ++k)
+                        tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
                 }
-                break;
             }
+            break;
+        }
 
-            case STBI__PNG_TYPE('I', 'D', 'A', 'T'):
+        case STBI__PNG_TYPE('I', 'D', 'A', 'T'):
+        {
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len)
+                return stbi__err("no PLTE", "Corrupt PNG");
+            if (scan == STBI__SCAN_header)
             {
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(pal_img_n && !pal_len)
-                    return stbi__err("no PLTE", "Corrupt PNG");
-                if(scan == STBI__SCAN_header)
-                {
-                    s->img_n = pal_img_n;
-                    return 1;
-                }
-                if(( int )(ioff + c.length) < ( int )ioff)
-                    return 0;
-                if(ioff + c.length > idata_limit)
-                {
-                    stbi__uint32 idata_limit_old = idata_limit;
-                    stbi_uc* p;
-                    if(idata_limit == 0)
-                        idata_limit = c.length > 4096 ? c.length : 4096;
-                    while(ioff + c.length > idata_limit)
-                        idata_limit *= 2;
-                    STBI_NOTUSED(idata_limit_old);
-                    p = ( stbi_uc* )STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit);
-                    if(p == NULL)
-                        return stbi__err("outofmem", "Out of memory");
-                    z->idata = p;
-                }
-                if(!stbi__getn(s, z->idata + ioff, c.length))
-                    return stbi__err("outofdata", "Corrupt PNG");
-                ioff += c.length;
-                break;
+                s->img_n = pal_img_n;
+                return 1;
+            }
+            if ((int)(ioff + c.length) < (int)ioff)
+                return 0;
+            if (ioff + c.length > idata_limit)
+            {
+                stbi__uint32 idata_limit_old = idata_limit;
+                stbi_uc* p;
+                if (idata_limit == 0)
+                    idata_limit = c.length > 4096 ? c.length : 4096;
+                while (ioff + c.length > idata_limit)
+                    idata_limit *= 2;
+                STBI_NOTUSED(idata_limit_old);
+                p = (stbi_uc*)STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit);
+                if (p == NULL)
+                    return stbi__err("outofmem", "Out of memory");
+                z->idata = p;
             }
+            if (!stbi__getn(s, z->idata + ioff, c.length))
+                return stbi__err("outofdata", "Corrupt PNG");
+            ioff += c.length;
+            break;
+        }
 
-            case STBI__PNG_TYPE('I', 'E', 'N', 'D'):
+        case STBI__PNG_TYPE('I', 'E', 'N', 'D'):
+        {
+            stbi__uint32 raw_len, bpl;
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load)
+                return 1;
+            if (z->idata == NULL)
+                return stbi__err("no IDAT", "Corrupt PNG");
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            z->expanded = (stbi_uc*)stbi_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len,
+                                                                                 (int*)&raw_len, !is_iphone);
+            if (z->expanded == NULL)
+                return 0; // zlib should set error
+            STBI_FREE(z->idata);
+            z->idata = NULL;
+            if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans)
+                s->img_out_n = s->img_n + 1;
+            else
+                s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace))
+                return 0;
+            if (has_trans)
             {
-                stbi__uint32 raw_len, bpl;
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(scan != STBI__SCAN_load)
-                    return 1;
-                if(z->idata == NULL)
-                    return stbi__err("no IDAT", "Corrupt PNG");
-                // initial guess for decoded data size to avoid unnecessary reallocs
-                bpl = (s->img_x * z->depth + 7) / 8;    // bytes per line, per component
-                raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
-                z->expanded = ( stbi_uc* )stbi_zlib_decode_malloc_guesssize_headerflag(( char* )z->idata, ioff, raw_len,
-                                                                                       ( int* )&raw_len, !is_iphone);
-                if(z->expanded == NULL)
-                    return 0;    // zlib should set error
-                STBI_FREE(z->idata);
-                z->idata = NULL;
-                if((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans)
-                    s->img_out_n = s->img_n + 1;
-                else
-                    s->img_out_n = s->img_n;
-                if(!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace))
-                    return 0;
-                if(has_trans)
+                if (z->depth == 16)
                 {
-                    if(z->depth == 16)
-                    {
-                        if(!stbi__compute_transparency16(z, tc16, s->img_out_n))
-                            return 0;
-                    }
-                    else
-                    {
-                        if(!stbi__compute_transparency(z, tc, s->img_out_n))
-                            return 0;
-                    }
-                }
-                if(is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
-                    stbi__de_iphone(z);
-                if(pal_img_n)
-                {
-                    // pal_img_n == 3 or 4
-                    s->img_n = pal_img_n;    // record the actual colors we had
-                    s->img_out_n = pal_img_n;
-                    if(req_comp >= 3)
-                        s->img_out_n = req_comp;
-                    if(!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                    if (!stbi__compute_transparency16(z, tc16, s->img_out_n))
                         return 0;
                 }
-                else if(has_trans)
+                else
                 {
-                    // non-paletted image with tRNS -> source image has (constant) alpha
-                    ++s->img_n;
+                    if (!stbi__compute_transparency(z, tc, s->img_out_n))
+                        return 0;
                 }
-                STBI_FREE(z->expanded);
-                z->expanded = NULL;
-                return 1;
             }
+            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+                stbi__de_iphone(z);
+            if (pal_img_n)
+            {
+                // pal_img_n == 3 or 4
+                s->img_n = pal_img_n; // record the actual colors we had
+                s->img_out_n = pal_img_n;
+                if (req_comp >= 3)
+                    s->img_out_n = req_comp;
+                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                    return 0;
+            }
+            else if (has_trans)
+            {
+                // non-paletted image with tRNS -> source image has (constant) alpha
+                ++s->img_n;
+            }
+            STBI_FREE(z->expanded);
+            z->expanded = NULL;
+            return 1;
+        }
 
-            default:
-                // if critical, fail
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if((c.type & (1 << 29)) == 0)
-                {
+        default:
+            // if critical, fail
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if ((c.type & (1 << 29)) == 0)
+            {
 #ifndef STBI_NO_FAILURE_STRINGS
-                    // not threadsafe
-                    static char invalid_chunk[] = "XXXX PNG chunk not known";
-                    invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
-                    invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
-                    invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
-                    invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
+                // not threadsafe
+                static char invalid_chunk[] = "XXXX PNG chunk not known";
+                invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+                invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+                invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
+                invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
 #endif
-                    return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
-                }
-                stbi__skip(s, c.length);
-                break;
+                return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+            }
+            stbi__skip(s, c.length);
+            break;
         }
         // end of PNG chunk, read and skip CRC
         stbi__get32be(s);
@@ -5501,31 +5489,30 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp)
 static void* stbi__do_png(stbi__png* p, int* x, int* y, int* n, int req_comp, stbi__result_info* ri)
 {
     void* result = NULL;
-    if(req_comp < 0 || req_comp > 4)
+    if (req_comp < 0 || req_comp > 4)
         return stbi__errpuc("bad req_comp", "Internal error");
-    if(stbi__parse_png_file(p, STBI__SCAN_load, req_comp))
+    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp))
     {
-        if(p->depth < 8)
+        if (p->depth < 8)
             ri->bits_per_channel = 8;
         else
             ri->bits_per_channel = p->depth;
         result = p->out;
         p->out = NULL;
-        if(req_comp && req_comp != p->s->img_out_n)
+        if (req_comp && req_comp != p->s->img_out_n)
         {
-            if(ri->bits_per_channel == 8)
-                result =
-                    stbi__convert_format(( unsigned char* )result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+            if (ri->bits_per_channel == 8)
+                result = stbi__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
             else
-                result = stbi__convert_format16(( stbi__uint16* )result, p->s->img_out_n, req_comp, p->s->img_x,
+                result = stbi__convert_format16((stbi__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x,
                                                 p->s->img_y);
             p->s->img_out_n = req_comp;
-            if(result == NULL)
+            if (result == NULL)
                 return result;
         }
         *x = p->s->img_x;
         *y = p->s->img_y;
-        if(n)
+        if (n)
             *n = p->s->img_n;
     }
     STBI_FREE(p->out);
@@ -5555,16 +5542,16 @@ static int stbi__png_test(stbi__context* s)
 
 static int stbi__png_info_raw(stbi__png* p, int* x, int* y, int* comp)
 {
-    if(!stbi__parse_png_file(p, STBI__SCAN_header, 0))
+    if (!stbi__parse_png_file(p, STBI__SCAN_header, 0))
     {
         stbi__rewind(p->s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = p->s->img_x;
-    if(y)
+    if (y)
         *y = p->s->img_y;
-    if(comp)
+    if (comp)
         *comp = p->s->img_n;
     return 1;
 }
@@ -5580,9 +5567,9 @@ static int stbi__png_is16(stbi__context* s)
 {
     stbi__png p;
     p.s = s;
-    if(!stbi__png_info_raw(&p, NULL, NULL, NULL))
+    if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
         return 0;
-    if(p.depth != 16)
+    if (p.depth != 16)
     {
         stbi__rewind(p.s);
         return 0;
@@ -5598,14 +5585,14 @@ static int stbi__bmp_test_raw(stbi__context* s)
 {
     int r;
     int sz;
-    if(stbi__get8(s) != 'B')
+    if (stbi__get8(s) != 'B')
         return 0;
-    if(stbi__get8(s) != 'M')
+    if (stbi__get8(s) != 'M')
         return 0;
-    stbi__get32le(s);    // discard filesize
-    stbi__get16le(s);    // discard reserved
-    stbi__get16le(s);    // discard reserved
-    stbi__get32le(s);    // discard data offset
+    stbi__get32le(s); // discard filesize
+    stbi__get16le(s); // discard reserved
+    stbi__get16le(s); // discard reserved
+    stbi__get32le(s); // discard data offset
     sz = stbi__get32le(s);
     r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
     return r;
@@ -5622,28 +5609,28 @@ static int stbi__bmp_test(stbi__context* s)
 static int stbi__high_bit(unsigned int z)
 {
     int n = 0;
-    if(z == 0)
+    if (z == 0)
         return -1;
-    if(z >= 0x10000)
+    if (z >= 0x10000)
         n += 16, z >>= 16;
-    if(z >= 0x00100)
+    if (z >= 0x00100)
         n += 8, z >>= 8;
-    if(z >= 0x00010)
+    if (z >= 0x00010)
         n += 4, z >>= 4;
-    if(z >= 0x00004)
+    if (z >= 0x00004)
         n += 2, z >>= 2;
-    if(z >= 0x00002)
+    if (z >= 0x00002)
         n += 1, z >>= 1;
     return n;
 }
 
 static int stbi__bitcount(unsigned int a)
 {
-    a = (a & 0x55555555) + ((a >> 1) & 0x55555555);    // max 2
-    a = (a & 0x33333333) + ((a >> 2) & 0x33333333);    // max 4
-    a = (a + (a >> 4)) & 0x0f0f0f0f;    // max 8 per 4, now 8 bits
-    a = (a + (a >> 8));    // max 16 per 8 bits
-    a = (a + (a >> 16));    // max 32 per 8 bits
+    a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
+    a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
+    a = (a + (a >> 4)) & 0x0f0f0f0f;                // max 8 per 4, now 8 bits
+    a = (a + (a >> 8));                             // max 16 per 8 bits
+    a = (a + (a >> 16));                            // max 32 per 8 bits
     return a & 0xff;
 }
 
@@ -5664,16 +5651,24 @@ static int stbi__shiftsigned(int v, int shift, int bits)
         0x01 /*0b00000001*/,
     };
     static unsigned int shift_table[9] = {
-        0, 0, 0, 1, 0, 2, 4, 6, 0,
+        0,
+        0,
+        0,
+        1,
+        0,
+        2,
+        4,
+        6,
+        0,
     };
-    if(shift < 0)
+    if (shift < 0)
         v <<= -shift;
     else
         v >>= shift;
     STBI_ASSERT(v >= 0 && v < 256);
     v >>= (8 - bits);
     STBI_ASSERT(bits >= 0 && bits <= 8);
-    return ( int )(( unsigned )v * mul_table[bits]) >> shift_table[bits];
+    return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits];
 }
 
 typedef struct
@@ -5685,18 +5680,18 @@ typedef struct
 static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
 {
     int hsz;
-    if(stbi__get8(s) != 'B' || stbi__get8(s) != 'M')
+    if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M')
         return stbi__errpuc("not BMP", "Corrupt BMP");
-    stbi__get32le(s);    // discard filesize
-    stbi__get16le(s);    // discard reserved
-    stbi__get16le(s);    // discard reserved
+    stbi__get32le(s); // discard filesize
+    stbi__get16le(s); // discard reserved
+    stbi__get16le(s); // discard reserved
     info->offset = stbi__get32le(s);
     info->hsz = hsz = stbi__get32le(s);
     info->mr = info->mg = info->mb = info->ma = 0;
 
-    if(hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124)
+    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124)
         return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
-    if(hsz == 12)
+    if (hsz == 12)
     {
         s->img_x = stbi__get16le(s);
         s->img_y = stbi__get16le(s);
@@ -5706,39 +5701,39 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
         s->img_x = stbi__get32le(s);
         s->img_y = stbi__get32le(s);
     }
-    if(stbi__get16le(s) != 1)
+    if (stbi__get16le(s) != 1)
         return stbi__errpuc("bad BMP", "bad BMP");
     info->bpp = stbi__get16le(s);
-    if(hsz != 12)
+    if (hsz != 12)
     {
         int compress = stbi__get32le(s);
-        if(compress == 1 || compress == 2)
+        if (compress == 1 || compress == 2)
             return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
-        stbi__get32le(s);    // discard sizeof
-        stbi__get32le(s);    // discard hres
-        stbi__get32le(s);    // discard vres
-        stbi__get32le(s);    // discard colorsused
-        stbi__get32le(s);    // discard max important
-        if(hsz == 40 || hsz == 56)
-        {
-            if(hsz == 56)
+        stbi__get32le(s); // discard sizeof
+        stbi__get32le(s); // discard hres
+        stbi__get32le(s); // discard vres
+        stbi__get32le(s); // discard colorsused
+        stbi__get32le(s); // discard max important
+        if (hsz == 40 || hsz == 56)
+        {
+            if (hsz == 56)
             {
                 stbi__get32le(s);
                 stbi__get32le(s);
                 stbi__get32le(s);
                 stbi__get32le(s);
             }
-            if(info->bpp == 16 || info->bpp == 32)
+            if (info->bpp == 16 || info->bpp == 32)
             {
-                if(compress == 0)
+                if (compress == 0)
                 {
-                    if(info->bpp == 32)
+                    if (info->bpp == 32)
                     {
                         info->mr = 0xffu << 16;
                         info->mg = 0xffu << 8;
                         info->mb = 0xffu << 0;
                         info->ma = 0xffu << 24;
-                        info->all_a = 0;    // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+                        info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
                     }
                     else
                     {
@@ -5747,13 +5742,13 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
                         info->mb = 31u << 0;
                     }
                 }
-                else if(compress == 3)
+                else if (compress == 3)
                 {
                     info->mr = stbi__get32le(s);
                     info->mg = stbi__get32le(s);
                     info->mb = stbi__get32le(s);
                     // not documented, but generated by photoshop and handled by mspaint
-                    if(info->mr == info->mg && info->mg == info->mb)
+                    if (info->mr == info->mg && info->mg == info->mb)
                     {
                         // ?!?!?
                         return stbi__errpuc("bad BMP", "bad BMP");
@@ -5766,25 +5761,25 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
         else
         {
             int i;
-            if(hsz != 108 && hsz != 124)
+            if (hsz != 108 && hsz != 124)
                 return stbi__errpuc("bad BMP", "bad BMP");
             info->mr = stbi__get32le(s);
             info->mg = stbi__get32le(s);
             info->mb = stbi__get32le(s);
             info->ma = stbi__get32le(s);
-            stbi__get32le(s);    // discard color space
-            for(i = 0; i < 12; ++i)
-                stbi__get32le(s);    // discard color space parameters
-            if(hsz == 124)
+            stbi__get32le(s); // discard color space
+            for (i = 0; i < 12; ++i)
+                stbi__get32le(s); // discard color space parameters
+            if (hsz == 124)
             {
-                stbi__get32le(s);    // discard rendering intent
-                stbi__get32le(s);    // discard offset of profile data
-                stbi__get32le(s);    // discard size of profile data
-                stbi__get32le(s);    // discard reserved
+                stbi__get32le(s); // discard rendering intent
+                stbi__get32le(s); // discard offset of profile data
+                stbi__get32le(s); // discard size of profile data
+                stbi__get32le(s); // discard reserved
             }
         }
     }
-    return ( void* )1;
+    return (void*)1;
 }
 
 static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri)
@@ -5798,11 +5793,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
     STBI_NOTUSED(ri);
 
     info.all_a = 255;
-    if(stbi__bmp_parse_header(s, &info) == NULL)
-        return NULL;    // error code already set
+    if (stbi__bmp_parse_header(s, &info) == NULL)
+        return NULL; // error code already set
 
-    flip_vertically = (( int )s->img_y) > 0;
-    s->img_y = abs(( int )s->img_y);
+    flip_vertically = ((int)s->img_y) > 0;
+    s->img_y = abs((int)s->img_y);
 
     mr = info.mr;
     mg = info.mg;
@@ -5810,53 +5805,53 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
     ma = info.ma;
     all_a = info.all_a;
 
-    if(info.hsz == 12)
+    if (info.hsz == 12)
     {
-        if(info.bpp < 24)
+        if (info.bpp < 24)
             psize = (info.offset - 14 - 24) / 3;
     }
     else
     {
-        if(info.bpp < 16)
+        if (info.bpp < 16)
             psize = (info.offset - 14 - info.hsz) >> 2;
     }
 
     s->img_n = ma ? 4 : 3;
-    if(req_comp && req_comp >= 3)    // we can directly decode 3 or 4
+    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
         target = req_comp;
     else
-        target = s->img_n;    // if they want monochrome, we'll post-convert
+        target = s->img_n; // if they want monochrome, we'll post-convert
 
     // sanity-check size
-    if(!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+    if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
         return stbi__errpuc("too large", "Corrupt BMP");
 
-    out = ( stbi_uc* )stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
-    if(!out)
+    out = (stbi_uc*)stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
+    if (!out)
         return stbi__errpuc("outofmem", "Out of memory");
-    if(info.bpp < 16)
+    if (info.bpp < 16)
     {
         int z = 0;
-        if(psize == 0 || psize > 256)
+        if (psize == 0 || psize > 256)
         {
             STBI_FREE(out);
             return stbi__errpuc("invalid", "Corrupt BMP");
         }
-        for(i = 0; i < psize; ++i)
+        for (i = 0; i < psize; ++i)
         {
             pal[i][2] = stbi__get8(s);
             pal[i][1] = stbi__get8(s);
             pal[i][0] = stbi__get8(s);
-            if(info.hsz != 12)
+            if (info.hsz != 12)
                 stbi__get8(s);
             pal[i][3] = 255;
         }
         stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-        if(info.bpp == 1)
+        if (info.bpp == 1)
             width = (s->img_x + 7) >> 3;
-        else if(info.bpp == 4)
+        else if (info.bpp == 4)
             width = (s->img_x + 1) >> 1;
-        else if(info.bpp == 8)
+        else if (info.bpp == 8)
             width = s->img_x;
         else
         {
@@ -5864,18 +5859,18 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
             return stbi__errpuc("bad bpp", "Corrupt BMP");
         }
         pad = (-width) & 3;
-        if(info.bpp == 1)
+        if (info.bpp == 1)
         {
-            for(j = 0; j < ( int )s->img_y; ++j)
+            for (j = 0; j < (int)s->img_y; ++j)
             {
                 int bit_offset = 7, v = stbi__get8(s);
-                for(i = 0; i < ( int )s->img_x; ++i)
+                for (i = 0; i < (int)s->img_x; ++i)
                 {
                     int color = (v >> bit_offset) & 0x1;
                     out[z++] = pal[color][0];
                     out[z++] = pal[color][1];
                     out[z++] = pal[color][2];
-                    if((--bit_offset) < 0)
+                    if ((--bit_offset) < 0)
                     {
                         bit_offset = 7;
                         v = stbi__get8(s);
@@ -5886,12 +5881,12 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
         }
         else
         {
-            for(j = 0; j < ( int )s->img_y; ++j)
+            for (j = 0; j < (int)s->img_y; ++j)
             {
-                for(i = 0; i < ( int )s->img_x; i += 2)
+                for (i = 0; i < (int)s->img_x; i += 2)
                 {
                     int v = stbi__get8(s), v2 = 0;
-                    if(info.bpp == 4)
+                    if (info.bpp == 4)
                     {
                         v2 = v & 15;
                         v >>= 4;
@@ -5899,15 +5894,15 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
                     out[z++] = pal[v][0];
                     out[z++] = pal[v][1];
                     out[z++] = pal[v][2];
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = 255;
-                    if(i + 1 == ( int )s->img_x)
+                    if (i + 1 == (int)s->img_x)
                         break;
                     v = (info.bpp == 8) ? stbi__get8(s) : v2;
                     out[z++] = pal[v][0];
                     out[z++] = pal[v][1];
                     out[z++] = pal[v][2];
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = 255;
                 }
                 stbi__skip(s, pad);
@@ -5920,25 +5915,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
         int z = 0;
         int easy = 0;
         stbi__skip(s, info.offset - 14 - info.hsz);
-        if(info.bpp == 24)
+        if (info.bpp == 24)
             width = 3 * s->img_x;
-        else if(info.bpp == 16)
+        else if (info.bpp == 16)
             width = 2 * s->img_x;
         else /* bpp = 32 and pad = 0 */
             width = 0;
         pad = (-width) & 3;
-        if(info.bpp == 24)
+        if (info.bpp == 24)
         {
             easy = 1;
         }
-        else if(info.bpp == 32)
+        else if (info.bpp == 32)
         {
-            if(mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
                 easy = 2;
         }
-        if(!easy)
+        if (!easy)
         {
-            if(!mr || !mg || !mb)
+            if (!mr || !mg || !mb)
             {
                 STBI_FREE(out);
                 return stbi__errpuc("bad masks", "Corrupt BMP");
@@ -5953,11 +5948,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
             ashift = stbi__high_bit(ma) - 7;
             acount = stbi__bitcount(ma);
         }
-        for(j = 0; j < ( int )s->img_y; ++j)
+        for (j = 0; j < (int)s->img_y; ++j)
         {
-            if(easy)
+            if (easy)
             {
-                for(i = 0; i < ( int )s->img_x; ++i)
+                for (i = 0; i < (int)s->img_x; ++i)
                 {
                     unsigned char a;
                     out[z + 2] = stbi__get8(s);
@@ -5966,23 +5961,23 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
                     z += 3;
                     a = (easy == 2 ? stbi__get8(s) : 255);
                     all_a |= a;
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = a;
                 }
             }
             else
             {
                 int bpp = info.bpp;
-                for(i = 0; i < ( int )s->img_x; ++i)
+                for (i = 0; i < (int)s->img_x; ++i)
                 {
-                    stbi__uint32 v = (bpp == 16 ? ( stbi__uint32 )stbi__get16le(s) : stbi__get32le(s));
+                    stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s));
                     unsigned int a;
                     out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
                     out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
                     out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
                     a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
                     all_a |= a;
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = STBI__BYTECAST(a);
                 }
             }
@@ -5991,34 +5986,34 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
     }
 
     // if alpha channel is all 0s, replace with all 255s
-    if(target == 4 && all_a == 0)
-        for(i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4)
+    if (target == 4 && all_a == 0)
+        for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4)
             out[i] = 255;
 
-    if(flip_vertically)
+    if (flip_vertically)
     {
         stbi_uc t;
-        for(j = 0; j<( int )s->img_y>> 1; ++j)
+        for (j = 0; j < (int)s->img_y >> 1; ++j)
         {
             stbi_uc* p1 = out + j * s->img_x * target;
             stbi_uc* p2 = out + (s->img_y - 1 - j) * s->img_x * target;
-            for(i = 0; i < ( int )s->img_x * target; ++i)
+            for (i = 0; i < (int)s->img_x * target; ++i)
             {
                 t = p1[i], p1[i] = p2[i], p2[i] = t;
             }
         }
     }
 
-    if(req_comp && req_comp != target)
+    if (req_comp && req_comp != target)
     {
         out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
-        if(out == NULL)
-            return out;    // stbi__convert_format frees input on failure
+        if (out == NULL)
+            return out; // stbi__convert_format frees input on failure
     }
 
     *x = s->img_x;
     *y = s->img_y;
-    if(comp)
+    if (comp)
         *comp = s->img_n;
     return out;
 }
@@ -6031,25 +6026,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
 static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
 {
     // only RGB or RGBA (incl. 16bit) or grey allowed
-    if(is_rgb16)
+    if (is_rgb16)
         *is_rgb16 = 0;
-    switch(bits_per_pixel)
-    {
-        case 8:
-            return STBI_grey;
-        case 16:
-            if(is_grey)
-                return STBI_grey_alpha;
-            // fallthrough
-        case 15:
-            if(is_rgb16)
-                *is_rgb16 = 1;
-            return STBI_rgb;
-        case 24:    // fallthrough
-        case 32:
-            return bits_per_pixel / 8;
-        default:
-            return 0;
+    switch (bits_per_pixel)
+    {
+    case 8:
+        return STBI_grey;
+    case 16:
+        if (is_grey)
+            return STBI_grey_alpha;
+        // fallthrough
+    case 15:
+        if (is_rgb16)
+            *is_rgb16 = 1;
+        return STBI_rgb;
+    case 24: // fallthrough
+    case 32:
+        return bits_per_pixel / 8;
+    default:
+        return 0;
     }
 }
 
@@ -6057,58 +6052,58 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp)
 {
     int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
     int sz, tga_colormap_type;
-    stbi__get8(s);    // discard Offset
-    tga_colormap_type = stbi__get8(s);    // colormap type
-    if(tga_colormap_type > 1)
+    stbi__get8(s);                     // discard Offset
+    tga_colormap_type = stbi__get8(s); // colormap type
+    if (tga_colormap_type > 1)
     {
         stbi__rewind(s);
-        return 0;    // only RGB or indexed allowed
+        return 0; // only RGB or indexed allowed
     }
-    tga_image_type = stbi__get8(s);    // image type
-    if(tga_colormap_type == 1)
-    {    // colormapped (paletted) image
-        if(tga_image_type != 1 && tga_image_type != 9)
+    tga_image_type = stbi__get8(s); // image type
+    if (tga_colormap_type == 1)
+    { // colormapped (paletted) image
+        if (tga_image_type != 1 && tga_image_type != 9)
         {
             stbi__rewind(s);
             return 0;
         }
-        stbi__skip(s, 4);    // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s);    //   check bits per palette color entry
-        if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+        stbi__skip(s, 4);   // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s); //   check bits per palette color entry
+        if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
         {
             stbi__rewind(s);
             return 0;
         }
-        stbi__skip(s, 4);    // skip image x and y origin
+        stbi__skip(s, 4); // skip image x and y origin
         tga_colormap_bpp = sz;
     }
     else
-    {    // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
-        if((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11))
+    { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if ((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11))
         {
             stbi__rewind(s);
-            return 0;    // only RGB or grey allowed, +/- RLE
+            return 0; // only RGB or grey allowed, +/- RLE
         }
-        stbi__skip(s, 9);    // skip colormap specification and image x/y origin
+        stbi__skip(s, 9); // skip colormap specification and image x/y origin
         tga_colormap_bpp = 0;
     }
     tga_w = stbi__get16le(s);
-    if(tga_w < 1)
+    if (tga_w < 1)
     {
         stbi__rewind(s);
-        return 0;    // test width
+        return 0; // test width
     }
     tga_h = stbi__get16le(s);
-    if(tga_h < 1)
+    if (tga_h < 1)
     {
         stbi__rewind(s);
-        return 0;    // test height
+        return 0; // test height
     }
-    tga_bits_per_pixel = stbi__get8(s);    // bits per pixel
-    stbi__get8(s);    // ignore alpha bits
-    if(tga_colormap_bpp != 0)
+    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
+    stbi__get8(s);                      // ignore alpha bits
+    if (tga_colormap_bpp != 0)
     {
-        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16))
+        if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16))
         {
             // when using a colormap, tga_bits_per_pixel is the size of the indexes
             // I don't think anything but 8 or 16bit indexes makes sense
@@ -6121,56 +6116,56 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp)
     {
         tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
     }
-    if(!tga_comp)
+    if (!tga_comp)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = tga_w;
-    if(y)
+    if (y)
         *y = tga_h;
-    if(comp)
+    if (comp)
         *comp = tga_comp;
-    return 1;    // seems to have passed everything
+    return 1; // seems to have passed everything
 }
 
 static int stbi__tga_test(stbi__context* s)
 {
     int res = 0;
     int sz, tga_color_type;
-    stbi__get8(s);    //   discard Offset
-    tga_color_type = stbi__get8(s);    //   color type
-    if(tga_color_type > 1)
-        goto errorEnd;    //   only RGB or indexed allowed
-    sz = stbi__get8(s);    //   image type
-    if(tga_color_type == 1)
-    {    // colormapped (paletted) image
-        if(sz != 1 && sz != 9)
-            goto errorEnd;    // colortype 1 demands image type 1 or 9
-        stbi__skip(s, 4);    // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s);    //   check bits per palette color entry
-        if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+    stbi__get8(s);                  //   discard Offset
+    tga_color_type = stbi__get8(s); //   color type
+    if (tga_color_type > 1)
+        goto errorEnd;  //   only RGB or indexed allowed
+    sz = stbi__get8(s); //   image type
+    if (tga_color_type == 1)
+    { // colormapped (paletted) image
+        if (sz != 1 && sz != 9)
+            goto errorEnd;  // colortype 1 demands image type 1 or 9
+        stbi__skip(s, 4);   // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s); //   check bits per palette color entry
+        if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
             goto errorEnd;
-        stbi__skip(s, 4);    // skip image x and y origin
+        stbi__skip(s, 4); // skip image x and y origin
     }
     else
-    {    // "normal" image w/o colormap
-        if((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11))
-            goto errorEnd;    // only RGB or grey allowed, +/- RLE
-        stbi__skip(s, 9);    // skip colormap specification and image x/y origin
-    }
-    if(stbi__get16le(s) < 1)
-        goto errorEnd;    //   test width
-    if(stbi__get16le(s) < 1)
-        goto errorEnd;    //   test height
-    sz = stbi__get8(s);    //   bits per pixel
-    if((tga_color_type == 1) && (sz != 8) && (sz != 16))
-        goto errorEnd;    // for colormapped images, bpp is size of an index
-    if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+    { // "normal" image w/o colormap
+        if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11))
+            goto errorEnd; // only RGB or grey allowed, +/- RLE
+        stbi__skip(s, 9);  // skip colormap specification and image x/y origin
+    }
+    if (stbi__get16le(s) < 1)
+        goto errorEnd; //   test width
+    if (stbi__get16le(s) < 1)
+        goto errorEnd;  //   test height
+    sz = stbi__get8(s); //   bits per pixel
+    if ((tga_color_type == 1) && (sz != 8) && (sz != 16))
+        goto errorEnd; // for colormapped images, bpp is size of an index
+    if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
         goto errorEnd;
 
-    res = 1;    // if we got this far, everything's good and we can return 1 instead of 0
+    res = 1; // if we got this far, everything's good and we can return 1 instead of 0
 
 errorEnd:
     stbi__rewind(s);
@@ -6180,7 +6175,7 @@ static int stbi__tga_test(stbi__context* s)
 // read 16bit value and convert to 24bit RGB
 static void stbi__tga_read_rgb16(stbi__context* s, stbi_uc* out)
 {
-    stbi__uint16 px = ( stbi__uint16 )stbi__get16le(s);
+    stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
     stbi__uint16 fiveBitMask = 31;
     // we have 3 channels with 5bits each
     int r = (px >> 10) & fiveBitMask;
@@ -6226,7 +6221,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     STBI_NOTUSED(ri);
 
     //   do a tiny bit of precessing
-    if(tga_image_type >= 8)
+    if (tga_image_type >= 8)
     {
         tga_image_type -= 8;
         tga_is_RLE = 1;
@@ -6234,33 +6229,33 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     tga_inverted = 1 - ((tga_inverted >> 5) & 1);
 
     //   If I'm paletted, then I'll use the number of bits from the palette
-    if(tga_indexed)
+    if (tga_indexed)
         tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
     else
         tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
 
-    if(!tga_comp)    // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+    if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
         return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
 
     //   tga info
     *x = tga_width;
     *y = tga_height;
-    if(comp)
+    if (comp)
         *comp = tga_comp;
 
-    if(!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+    if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
         return stbi__errpuc("too large", "Corrupt TGA");
 
-    tga_data = ( unsigned char* )stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
-    if(!tga_data)
+    tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+    if (!tga_data)
         return stbi__errpuc("outofmem", "Out of memory");
 
     // skip to the data's starting position (offset usually = 0)
     stbi__skip(s, tga_offset);
 
-    if(!tga_indexed && !tga_is_RLE && !tga_rgb16)
+    if (!tga_indexed && !tga_is_RLE && !tga_rgb16)
     {
-        for(i = 0; i < tga_height; ++i)
+        for (i = 0; i < tga_height; ++i)
         {
             int row = tga_inverted ? tga_height - i - 1 : i;
             stbi_uc* tga_row = tga_data + row * tga_width * tga_comp;
@@ -6270,28 +6265,28 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     else
     {
         //   do I need to load a palette?
-        if(tga_indexed)
+        if (tga_indexed)
         {
             //   any data to skip? (offset usually = 0)
             stbi__skip(s, tga_palette_start);
             //   load the palette
-            tga_palette = ( unsigned char* )stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
-            if(!tga_palette)
+            tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+            if (!tga_palette)
             {
                 STBI_FREE(tga_data);
                 return stbi__errpuc("outofmem", "Out of memory");
             }
-            if(tga_rgb16)
+            if (tga_rgb16)
             {
                 stbi_uc* pal_entry = tga_palette;
                 STBI_ASSERT(tga_comp == STBI_rgb);
-                for(i = 0; i < tga_palette_len; ++i)
+                for (i = 0; i < tga_palette_len; ++i)
                 {
                     stbi__tga_read_rgb16(s, pal_entry);
                     pal_entry += tga_comp;
                 }
             }
-            else if(!stbi__getn(s, tga_palette, tga_palette_len * tga_comp))
+            else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp))
             {
                 STBI_FREE(tga_data);
                 STBI_FREE(tga_palette);
@@ -6299,12 +6294,12 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
             }
         }
         //   load the data
-        for(i = 0; i < tga_width * tga_height; ++i)
+        for (i = 0; i < tga_width * tga_height; ++i)
         {
             //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
-            if(tga_is_RLE)
+            if (tga_is_RLE)
             {
-                if(RLE_count == 0)
+                if (RLE_count == 0)
                 {
                     //   yep, get the next byte as a RLE command
                     int RLE_cmd = stbi__get8(s);
@@ -6312,7 +6307,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
                     RLE_repeating = RLE_cmd >> 7;
                     read_next_pixel = 1;
                 }
-                else if(!RLE_repeating)
+                else if (!RLE_repeating)
                 {
                     read_next_pixel = 1;
                 }
@@ -6322,25 +6317,25 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
                 read_next_pixel = 1;
             }
             //   OK, if I need to read a pixel, do it now
-            if(read_next_pixel)
+            if (read_next_pixel)
             {
                 //   load however much data we did have
-                if(tga_indexed)
+                if (tga_indexed)
                 {
                     // read in index, then perform the lookup
                     int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
-                    if(pal_idx >= tga_palette_len)
+                    if (pal_idx >= tga_palette_len)
                     {
                         // invalid index
                         pal_idx = 0;
                     }
                     pal_idx *= tga_comp;
-                    for(j = 0; j < tga_comp; ++j)
+                    for (j = 0; j < tga_comp; ++j)
                     {
                         raw_data[j] = tga_palette[pal_idx + j];
                     }
                 }
-                else if(tga_rgb16)
+                else if (tga_rgb16)
                 {
                     STBI_ASSERT(tga_comp == STBI_rgb);
                     stbi__tga_read_rgb16(s, raw_data);
@@ -6348,30 +6343,30 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
                 else
                 {
                     //   read in the data raw
-                    for(j = 0; j < tga_comp; ++j)
+                    for (j = 0; j < tga_comp; ++j)
                     {
                         raw_data[j] = stbi__get8(s);
                     }
                 }
                 //   clear the reading flag for the next pixel
                 read_next_pixel = 0;
-            }    // end of reading a pixel
+            } // end of reading a pixel
 
             // copy data
-            for(j = 0; j < tga_comp; ++j)
+            for (j = 0; j < tga_comp; ++j)
                 tga_data[i * tga_comp + j] = raw_data[j];
 
             //   in case we're in RLE mode, keep counting down
             --RLE_count;
         }
         //   do I need to invert the image?
-        if(tga_inverted)
+        if (tga_inverted)
         {
-            for(j = 0; j * 2 < tga_height; ++j)
+            for (j = 0; j * 2 < tga_height; ++j)
             {
                 int index1 = j * tga_width * tga_comp;
                 int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
-                for(i = tga_width * tga_comp; i > 0; --i)
+                for (i = tga_width * tga_comp; i > 0; --i)
                 {
                     unsigned char temp = tga_data[index1];
                     tga_data[index1] = tga_data[index2];
@@ -6382,17 +6377,17 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
             }
         }
         //   clear my palette, if I had one
-        if(tga_palette != NULL)
+        if (tga_palette != NULL)
         {
             STBI_FREE(tga_palette);
         }
     }
 
     // swap RGB - if the source data was RGB16, it already is in the right order
-    if(tga_comp >= 3 && !tga_rgb16)
+    if (tga_comp >= 3 && !tga_rgb16)
     {
         unsigned char* tga_pixel = tga_data;
-        for(i = 0; i < tga_width * tga_height; ++i)
+        for (i = 0; i < tga_width * tga_height; ++i)
         {
             unsigned char temp = tga_pixel[0];
             tga_pixel[0] = tga_pixel[2];
@@ -6402,7 +6397,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     }
 
     // convert to target component count
-    if(req_comp && req_comp != tga_comp)
+    if (req_comp && req_comp != tga_comp)
         tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
 
     //   the things I do to get rid of an error message, and yet keep
@@ -6429,38 +6424,38 @@ static int stbi__psd_decode_rle(stbi__context* s, stbi_uc* p, int pixelCount)
     int count, nleft, len;
 
     count = 0;
-    while((nleft = pixelCount - count) > 0)
+    while ((nleft = pixelCount - count) > 0)
     {
         len = stbi__get8(s);
-        if(len == 128)
+        if (len == 128)
         {
             // No-op.
         }
-        else if(len < 128)
+        else if (len < 128)
         {
             // Copy next len+1 bytes literally.
             len++;
-            if(len > nleft)
-                return 0;    // corrupt data
+            if (len > nleft)
+                return 0; // corrupt data
             count += len;
-            while(len)
+            while (len)
             {
                 *p = stbi__get8(s);
                 p += 4;
                 len--;
             }
         }
-        else if(len > 128)
+        else if (len > 128)
         {
             stbi_uc val;
             // Next -len+1 bytes in the dest are replicated from next source byte.
             // (Interpret len as a negative 8-bit int.)
             len = 257 - len;
-            if(len > nleft)
-                return 0;    // corrupt data
+            if (len > nleft)
+                return 0; // corrupt data
             val = stbi__get8(s);
             count += len;
-            while(len)
+            while (len)
             {
                 *p = val;
                 p += 4;
@@ -6483,11 +6478,11 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     STBI_NOTUSED(ri);
 
     // Check identifier
-    if(stbi__get32be(s) != 0x38425053)    // "8BPS"
+    if (stbi__get32be(s) != 0x38425053) // "8BPS"
         return stbi__errpuc("not PSD", "Corrupt PSD image");
 
     // Check file type version.
-    if(stbi__get16be(s) != 1)
+    if (stbi__get16be(s) != 1)
         return stbi__errpuc("wrong version", "Unsupported version of PSD image");
 
     // Skip 6 reserved bytes.
@@ -6495,7 +6490,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
 
     // Read the number of channels (R, G, B, A, etc).
     channelCount = stbi__get16be(s);
-    if(channelCount < 0 || channelCount > 16)
+    if (channelCount < 0 || channelCount > 16)
         return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
 
     // Read the rows and columns of the image.
@@ -6504,7 +6499,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
 
     // Make sure the depth is 8 bits.
     bitdepth = stbi__get16be(s);
-    if(bitdepth != 8 && bitdepth != 16)
+    if (bitdepth != 8 && bitdepth != 16)
         return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
 
     // Make sure the color mode is RGB.
@@ -6517,7 +6512,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     //   7: Multichannel
     //   8: Duotone
     //   9: Lab color
-    if(stbi__get16be(s) != 3)
+    if (stbi__get16be(s) != 3)
         return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
 
     // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
@@ -6534,24 +6529,24 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     //   0: no compression
     //   1: RLE compressed
     compression = stbi__get16be(s);
-    if(compression > 1)
+    if (compression > 1)
         return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
     // Check size
-    if(!stbi__mad3sizes_valid(4, w, h, 0))
+    if (!stbi__mad3sizes_valid(4, w, h, 0))
         return stbi__errpuc("too large", "Corrupt PSD");
 
     // Create the destination image.
 
-    if(!compression && bitdepth == 16 && bpc == 16)
+    if (!compression && bitdepth == 16 && bpc == 16)
     {
-        out = ( stbi_uc* )stbi__malloc_mad3(8, w, h, 0);
+        out = (stbi_uc*)stbi__malloc_mad3(8, w, h, 0);
         ri->bits_per_channel = 16;
     }
     else
-        out = ( stbi_uc* )stbi__malloc(4 * (size_t)w * h);
+        out = (stbi_uc*)stbi__malloc(4 * (size_t)w * h);
 
-    if(!out)
+    if (!out)
         return stbi__errpuc("outofmem", "Out of memory");
     pixelCount = w * h;
 
@@ -6559,7 +6554,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     // memset( out, 0, pixelCount * 4 );
 
     // Finally, the image data.
-    if(compression)
+    if (compression)
     {
         // RLE as used by .PSD and .TIFF
         // Loop until you get the number of unpacked bytes you are expecting:
@@ -6574,21 +6569,21 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
         stbi__skip(s, h * channelCount * 2);
 
         // Read the RLE data by channel.
-        for(channel = 0; channel < 4; channel++)
+        for (channel = 0; channel < 4; channel++)
         {
             stbi_uc* p;
 
             p = out + channel;
-            if(channel >= channelCount)
+            if (channel >= channelCount)
             {
                 // Fill this channel with default data.
-                for(i = 0; i < pixelCount; i++, p += 4)
+                for (i = 0; i < pixelCount; i++, p += 4)
                     *p = (channel == 3 ? 255 : 0);
             }
             else
             {
                 // Read the RLE data.
-                if(!stbi__psd_decode_rle(s, p, pixelCount))
+                if (!stbi__psd_decode_rle(s, p, pixelCount))
                 {
                     STBI_FREE(out);
                     return stbi__errpuc("corrupt", "bad RLE data");
@@ -6602,45 +6597,45 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
         // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
 
         // Read the data by channel.
-        for(channel = 0; channel < 4; channel++)
+        for (channel = 0; channel < 4; channel++)
         {
-            if(channel >= channelCount)
+            if (channel >= channelCount)
             {
                 // Fill this channel with default data.
-                if(bitdepth == 16 && bpc == 16)
+                if (bitdepth == 16 && bpc == 16)
                 {
-                    stbi__uint16* q = (( stbi__uint16* )out) + channel;
+                    stbi__uint16* q = ((stbi__uint16*)out) + channel;
                     stbi__uint16 val = channel == 3 ? 65535 : 0;
-                    for(i = 0; i < pixelCount; i++, q += 4)
+                    for (i = 0; i < pixelCount; i++, q += 4)
                         *q = val;
                 }
                 else
                 {
                     stbi_uc* p = out + channel;
                     stbi_uc val = channel == 3 ? 255 : 0;
-                    for(i = 0; i < pixelCount; i++, p += 4)
+                    for (i = 0; i < pixelCount; i++, p += 4)
                         *p = val;
                 }
             }
             else
             {
-                if(ri->bits_per_channel == 16)
-                {    // output bpc
-                    stbi__uint16* q = (( stbi__uint16* )out) + channel;
-                    for(i = 0; i < pixelCount; i++, q += 4)
-                        *q = ( stbi__uint16 )stbi__get16be(s);
+                if (ri->bits_per_channel == 16)
+                { // output bpc
+                    stbi__uint16* q = ((stbi__uint16*)out) + channel;
+                    for (i = 0; i < pixelCount; i++, q += 4)
+                        *q = (stbi__uint16)stbi__get16be(s);
                 }
                 else
                 {
                     stbi_uc* p = out + channel;
-                    if(bitdepth == 16)
-                    {    // input bpc
-                        for(i = 0; i < pixelCount; i++, p += 4)
+                    if (bitdepth == 16)
+                    { // input bpc
+                        for (i = 0; i < pixelCount; i++, p += 4)
                             *p = (stbi_uc)(stbi__get16be(s) >> 8);
                     }
                     else
                     {
-                        for(i = 0; i < pixelCount; i++, p += 4)
+                        for (i = 0; i < pixelCount; i++, p += 4)
                             *p = stbi__get8(s);
                     }
                 }
@@ -6649,14 +6644,14 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     }
 
     // remove weird white matte from PSD
-    if(channelCount >= 4)
+    if (channelCount >= 4)
     {
-        if(ri->bits_per_channel == 16)
+        if (ri->bits_per_channel == 16)
         {
-            for(i = 0; i < w * h; ++i)
+            for (i = 0; i < w * h; ++i)
             {
-                stbi__uint16* pixel = ( stbi__uint16* )out + 4 * i;
-                if(pixel[3] != 0 && pixel[3] != 65535)
+                stbi__uint16* pixel = (stbi__uint16*)out + 4 * i;
+                if (pixel[3] != 0 && pixel[3] != 65535)
                 {
                     float a = pixel[3] / 65535.0f;
                     float ra = 1.0f / a;
@@ -6669,34 +6664,34 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
         }
         else
         {
-            for(i = 0; i < w * h; ++i)
+            for (i = 0; i < w * h; ++i)
             {
                 unsigned char* pixel = out + 4 * i;
-                if(pixel[3] != 0 && pixel[3] != 255)
+                if (pixel[3] != 0 && pixel[3] != 255)
                 {
                     float a = pixel[3] / 255.0f;
                     float ra = 1.0f / a;
                     float inv_a = 255.0f * (1 - ra);
-                    pixel[0] = ( unsigned char )(pixel[0] * ra + inv_a);
-                    pixel[1] = ( unsigned char )(pixel[1] * ra + inv_a);
-                    pixel[2] = ( unsigned char )(pixel[2] * ra + inv_a);
+                    pixel[0] = (unsigned char)(pixel[0] * ra + inv_a);
+                    pixel[1] = (unsigned char)(pixel[1] * ra + inv_a);
+                    pixel[2] = (unsigned char)(pixel[2] * ra + inv_a);
                 }
             }
         }
     }
 
     // convert to desired output format
-    if(req_comp && req_comp != 4)
+    if (req_comp && req_comp != 4)
     {
-        if(ri->bits_per_channel == 16)
-            out = ( stbi_uc* )stbi__convert_format16(( stbi__uint16* )out, 4, req_comp, w, h);
+        if (ri->bits_per_channel == 16)
+            out = (stbi_uc*)stbi__convert_format16((stbi__uint16*)out, 4, req_comp, w, h);
         else
             out = stbi__convert_format(out, 4, req_comp, w, h);
-        if(out == NULL)
-            return out;    // stbi__convert_format frees input on failure
+        if (out == NULL)
+            return out; // stbi__convert_format frees input on failure
     }
 
-    if(comp)
+    if (comp)
         *comp = 4;
     *y = h;
     *x = w;
@@ -6716,8 +6711,8 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
 static int stbi__pic_is4(stbi__context* s, const char* str)
 {
     int i;
-    for(i = 0; i < 4; ++i)
-        if(stbi__get8(s) != ( stbi_uc )str[i])
+    for (i = 0; i < 4; ++i)
+        if (stbi__get8(s) != (stbi_uc)str[i])
             return 0;
 
     return 1;
@@ -6727,13 +6722,13 @@ static int stbi__pic_test_core(stbi__context* s)
 {
     int i;
 
-    if(!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
+    if (!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
         return 0;
 
-    for(i = 0; i < 84; ++i)
+    for (i = 0; i < 84; ++i)
         stbi__get8(s);
 
-    if(!stbi__pic_is4(s, "PICT"))
+    if (!stbi__pic_is4(s, "PICT"))
         return 0;
 
     return 1;
@@ -6748,11 +6743,11 @@ static stbi_uc* stbi__readval(stbi__context* s, int channel, stbi_uc* dest)
 {
     int mask = 0x80, i;
 
-    for(i = 0; i < 4; ++i, mask >>= 1)
+    for (i = 0; i < 4; ++i, mask >>= 1)
     {
-        if(channel & mask)
+        if (channel & mask)
         {
-            if(stbi__at_eof(s))
+            if (stbi__at_eof(s))
                 return stbi__errpuc("bad file", "PIC file too short");
             dest[i] = stbi__get8(s);
         }
@@ -6765,8 +6760,8 @@ static void stbi__copyval(int channel, stbi_uc* dest, const stbi_uc* src)
 {
     int mask = 0x80, i;
 
-    for(i = 0; i < 4; ++i, mask >>= 1)
-        if(channel & mask)
+    for (i = 0; i < 4; ++i, mask >>= 1)
+        if (channel & mask)
             dest[i] = src[i];
 }
 
@@ -6781,7 +6776,7 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int
     {
         stbi__pic_packet* packet;
 
-        if(num_packets == sizeof(packets) / sizeof(packets[0]))
+        if (num_packets == sizeof(packets) / sizeof(packets[0]))
             return stbi__errpuc("bad format", "too many packets");
 
         packet = &packets[num_packets++];
@@ -6793,103 +6788,103 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int
 
         act_comp |= packet->channel;
 
-        if(stbi__at_eof(s))
+        if (stbi__at_eof(s))
             return stbi__errpuc("bad file", "file too short (reading packets)");
-        if(packet->size != 8)
+        if (packet->size != 8)
             return stbi__errpuc("bad format", "packet isn't 8bpp");
-    } while(chained);
+    } while (chained);
 
-    *comp = (act_comp & 0x10 ? 4 : 3);    // has alpha channel?
+    *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
 
-    for(y = 0; y < height; ++y)
+    for (y = 0; y < height; ++y)
     {
         int packet_idx;
 
-        for(packet_idx = 0; packet_idx < num_packets; ++packet_idx)
+        for (packet_idx = 0; packet_idx < num_packets; ++packet_idx)
         {
             stbi__pic_packet* packet = &packets[packet_idx];
             stbi_uc* dest = result + y * width * 4;
 
-            switch(packet->type)
+            switch (packet->type)
             {
-                default:
-                    return stbi__errpuc("bad format", "packet has bad compression type");
+            default:
+                return stbi__errpuc("bad format", "packet has bad compression type");
 
-                case 0:
-                {    // uncompressed
-                    int x;
+            case 0:
+            { // uncompressed
+                int x;
 
-                    for(x = 0; x < width; ++x, dest += 4)
-                        if(!stbi__readval(s, packet->channel, dest))
-                            return 0;
-                    break;
-                }
+                for (x = 0; x < width; ++x, dest += 4)
+                    if (!stbi__readval(s, packet->channel, dest))
+                        return 0;
+                break;
+            }
 
-                case 1:    // Pure RLE
-                {
-                    int left = width, i;
+            case 1: // Pure RLE
+            {
+                int left = width, i;
 
-                    while(left > 0)
-                    {
-                        stbi_uc count, value[4];
+                while (left > 0)
+                {
+                    stbi_uc count, value[4];
 
-                        count = stbi__get8(s);
-                        if(stbi__at_eof(s))
-                            return stbi__errpuc("bad file", "file too short (pure read count)");
+                    count = stbi__get8(s);
+                    if (stbi__at_eof(s))
+                        return stbi__errpuc("bad file", "file too short (pure read count)");
 
-                        if(count > left)
-                            count = ( stbi_uc )left;
+                    if (count > left)
+                        count = (stbi_uc)left;
 
-                        if(!stbi__readval(s, packet->channel, value))
-                            return 0;
+                    if (!stbi__readval(s, packet->channel, value))
+                        return 0;
 
-                        for(i = 0; i < count; ++i, dest += 4)
-                            stbi__copyval(packet->channel, dest, value);
-                        left -= count;
-                    }
+                    for (i = 0; i < count; ++i, dest += 4)
+                        stbi__copyval(packet->channel, dest, value);
+                    left -= count;
                 }
-                break;
+            }
+            break;
 
-                case 2:
-                {    // Mixed RLE
-                    int left = width;
-                    while(left > 0)
-                    {
-                        int count = stbi__get8(s), i;
-                        if(stbi__at_eof(s))
-                            return stbi__errpuc("bad file", "file too short (mixed read count)");
+            case 2:
+            { // Mixed RLE
+                int left = width;
+                while (left > 0)
+                {
+                    int count = stbi__get8(s), i;
+                    if (stbi__at_eof(s))
+                        return stbi__errpuc("bad file", "file too short (mixed read count)");
 
-                        if(count >= 128)
-                        {    // Repeated
-                            stbi_uc value[4];
+                    if (count >= 128)
+                    { // Repeated
+                        stbi_uc value[4];
 
-                            if(count == 128)
-                                count = stbi__get16be(s);
-                            else
-                                count -= 127;
-                            if(count > left)
-                                return stbi__errpuc("bad file", "scanline overrun");
+                        if (count == 128)
+                            count = stbi__get16be(s);
+                        else
+                            count -= 127;
+                        if (count > left)
+                            return stbi__errpuc("bad file", "scanline overrun");
 
-                            if(!stbi__readval(s, packet->channel, value))
-                                return 0;
+                        if (!stbi__readval(s, packet->channel, value))
+                            return 0;
 
-                            for(i = 0; i < count; ++i, dest += 4)
-                                stbi__copyval(packet->channel, dest, value);
-                        }
-                        else
-                        {    // Raw
-                            ++count;
-                            if(count > left)
-                                return stbi__errpuc("bad file", "scanline overrun");
+                        for (i = 0; i < count; ++i, dest += 4)
+                            stbi__copyval(packet->channel, dest, value);
+                    }
+                    else
+                    { // Raw
+                        ++count;
+                        if (count > left)
+                            return stbi__errpuc("bad file", "scanline overrun");
 
-                            for(i = 0; i < count; ++i, dest += 4)
-                                if(!stbi__readval(s, packet->channel, dest))
-                                    return 0;
-                        }
-                        left -= count;
+                        for (i = 0; i < count; ++i, dest += 4)
+                            if (!stbi__readval(s, packet->channel, dest))
+                                return 0;
                     }
-                    break;
+                    left -= count;
                 }
+                break;
+            }
             }
         }
     }
@@ -6903,35 +6898,35 @@ static void* stbi__pic_load(stbi__context* s, int* px, int* py, int* comp, int r
     int i, x, y, internal_comp;
     STBI_NOTUSED(ri);
 
-    if(!comp)
+    if (!comp)
         comp = &internal_comp;
 
-    for(i = 0; i < 92; ++i)
+    for (i = 0; i < 92; ++i)
         stbi__get8(s);
 
     x = stbi__get16be(s);
     y = stbi__get16be(s);
-    if(stbi__at_eof(s))
+    if (stbi__at_eof(s))
         return stbi__errpuc("bad file", "file too short (pic header)");
-    if(!stbi__mad3sizes_valid(x, y, 4, 0))
+    if (!stbi__mad3sizes_valid(x, y, 4, 0))
         return stbi__errpuc("too large", "PIC image too large to decode");
 
-    stbi__get32be(s);    // skip `ratio'
-    stbi__get16be(s);    // skip `fields'
-    stbi__get16be(s);    // skip `pad'
+    stbi__get32be(s); // skip `ratio'
+    stbi__get16be(s); // skip `fields'
+    stbi__get16be(s); // skip `pad'
 
     // intermediate buffer is RGBA
-    result = ( stbi_uc* )stbi__malloc_mad3(x, y, 4, 0);
+    result = (stbi_uc*)stbi__malloc_mad3(x, y, 4, 0);
     memset(result, 0xff, (size_t)x * y * 4);
 
-    if(!stbi__pic_load_core(s, x, y, comp, result))
+    if (!stbi__pic_load_core(s, x, y, comp, result))
     {
         STBI_FREE(result);
         result = 0;
     }
     *px = x;
     *py = y;
-    if(req_comp == 0)
+    if (req_comp == 0)
         req_comp = *comp;
     result = stbi__convert_format(result, 4, req_comp, x, y);
 
@@ -6960,8 +6955,8 @@ typedef struct
 typedef struct
 {
     int w, h;
-    stbi_uc* out;    // output buffer (always 4 components)
-    stbi_uc* background;    // The current "background" as far as a gif is concerned
+    stbi_uc* out;        // output buffer (always 4 components)
+    stbi_uc* background; // The current "background" as far as a gif is concerned
     stbi_uc* history;
     int flags, bgindex, ratio, transparent, eflags;
     stbi_uc pal[256][4];
@@ -6980,12 +6975,12 @@ typedef struct
 static int stbi__gif_test_raw(stbi__context* s)
 {
     int sz;
-    if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
+    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
         return 0;
     sz = stbi__get8(s);
-    if(sz != '9' && sz != '7')
+    if (sz != '9' && sz != '7')
         return 0;
-    if(stbi__get8(s) != 'a')
+    if (stbi__get8(s) != 'a')
         return 0;
     return 1;
 }
@@ -7000,7 +6995,7 @@ static int stbi__gif_test(stbi__context* s)
 static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], int num_entries, int transp)
 {
     int i;
-    for(i = 0; i < num_entries; ++i)
+    for (i = 0; i < num_entries; ++i)
     {
         pal[i][2] = stbi__get8(s);
         pal[i][1] = stbi__get8(s);
@@ -7012,13 +7007,13 @@ static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], in
 static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_info)
 {
     stbi_uc version;
-    if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
+    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
         return stbi__err("not GIF", "Corrupt GIF");
 
     version = stbi__get8(s);
-    if(version != '7' && version != '9')
+    if (version != '7' && version != '9')
         return stbi__err("not GIF", "Corrupt GIF");
-    if(stbi__get8(s) != 'a')
+    if (stbi__get8(s) != 'a')
         return stbi__err("not GIF", "Corrupt GIF");
 
     stbi__g_failure_reason = "";
@@ -7029,13 +7024,13 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in
     g->ratio = stbi__get8(s);
     g->transparent = -1;
 
-    if(comp != 0)
-        *comp = 4;    // can't actually tell whether it's 3 or 4 until we parse the comments
+    if (comp != 0)
+        *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments
 
-    if(is_info)
+    if (is_info)
         return 1;
 
-    if(g->flags & 0x80)
+    if (g->flags & 0x80)
         stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1);
 
     return 1;
@@ -7043,16 +7038,16 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in
 
 static int stbi__gif_info_raw(stbi__context* s, int* x, int* y, int* comp)
 {
-    stbi__gif* g = ( stbi__gif* )stbi__malloc(sizeof(stbi__gif));
-    if(!stbi__gif_header(s, g, comp, 1))
+    stbi__gif* g = (stbi__gif*)stbi__malloc(sizeof(stbi__gif));
+    if (!stbi__gif_header(s, g, comp, 1))
     {
         STBI_FREE(g);
         stbi__rewind(s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = g->w;
-    if(y)
+    if (y)
         *y = g->h;
     STBI_FREE(g);
     return 1;
@@ -7065,10 +7060,10 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code)
 
     // recurse to decode the prefixes, since the linked-list is backwards,
     // and working backwards through an interleaved image would be nasty
-    if(g->codes[code].prefix >= 0)
+    if (g->codes[code].prefix >= 0)
         stbi__out_gif_code(g, g->codes[code].prefix);
 
-    if(g->cur_y >= g->max_y)
+    if (g->cur_y >= g->max_y)
         return;
 
     idx = g->cur_x + g->cur_y;
@@ -7076,8 +7071,8 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code)
     g->history[idx / 4] = 1;
 
     c = &g->color_table[g->codes[code].suffix * 4];
-    if(c[3] > 128)
-    {    // don't render transparent pixels;
+    if (c[3] > 128)
+    { // don't render transparent pixels;
         p[0] = c[2];
         p[1] = c[1];
         p[2] = c[0];
@@ -7085,12 +7080,12 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code)
     }
     g->cur_x += 4;
 
-    if(g->cur_x >= g->max_x)
+    if (g->cur_x >= g->max_x)
     {
         g->cur_x = g->start_x;
         g->cur_y += g->step;
 
-        while(g->cur_y >= g->max_y && g->parse > 0)
+        while (g->cur_y >= g->max_y && g->parse > 0)
         {
             g->step = (1 << g->parse) * g->line_size;
             g->cur_y = g->start_y + (g->step >> 1);
@@ -7108,7 +7103,7 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
     stbi__gif_lzw* p;
 
     lzw_cs = stbi__get8(s);
-    if(lzw_cs > 12)
+    if (lzw_cs > 12)
         return NULL;
     clear = 1 << lzw_cs;
     first = 1;
@@ -7116,11 +7111,11 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
     codemask = (1 << codesize) - 1;
     bits = 0;
     valid_bits = 0;
-    for(init_code = 0; init_code < clear; init_code++)
+    for (init_code = 0; init_code < clear; init_code++)
     {
         g->codes[init_code].prefix = -1;
-        g->codes[init_code].first = ( stbi_uc )init_code;
-        g->codes[init_code].suffix = ( stbi_uc )init_code;
+        g->codes[init_code].first = (stbi_uc)init_code;
+        g->codes[init_code].suffix = (stbi_uc)init_code;
     }
 
     // support no starting clear code
@@ -7128,18 +7123,18 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
     oldcode = -1;
 
     len = 0;
-    for(;;)
+    for (;;)
     {
-        if(valid_bits < codesize)
+        if (valid_bits < codesize)
         {
-            if(len == 0)
+            if (len == 0)
             {
-                len = stbi__get8(s);    // start new block
-                if(len == 0)
+                len = stbi__get8(s); // start new block
+                if (len == 0)
                     return g->out;
             }
             --len;
-            bits |= ( stbi__int32 )stbi__get8(s) << valid_bits;
+            bits |= (stbi__int32)stbi__get8(s) << valid_bits;
             valid_bits += 8;
         }
         else
@@ -7148,46 +7143,46 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
             bits >>= codesize;
             valid_bits -= codesize;
             // @OPTIMIZE: is there some way we can accelerate the non-clear path?
-            if(code == clear)
-            {    // clear code
+            if (code == clear)
+            { // clear code
                 codesize = lzw_cs + 1;
                 codemask = (1 << codesize) - 1;
                 avail = clear + 2;
                 oldcode = -1;
                 first = 0;
             }
-            else if(code == clear + 1)
-            {    // end of stream code
+            else if (code == clear + 1)
+            { // end of stream code
                 stbi__skip(s, len);
-                while((len = stbi__get8(s)) > 0)
+                while ((len = stbi__get8(s)) > 0)
                     stbi__skip(s, len);
                 return g->out;
             }
-            else if(code <= avail)
+            else if (code <= avail)
             {
-                if(first)
+                if (first)
                 {
                     return stbi__errpuc("no clear code", "Corrupt GIF");
                 }
 
-                if(oldcode >= 0)
+                if (oldcode >= 0)
                 {
                     p = &g->codes[avail++];
-                    if(avail > 8192)
+                    if (avail > 8192)
                     {
                         return stbi__errpuc("too many codes", "Corrupt GIF");
                     }
 
-                    p->prefix = ( stbi__int16 )oldcode;
+                    p->prefix = (stbi__int16)oldcode;
                     p->first = g->codes[oldcode].first;
                     p->suffix = (code == avail) ? p->first : g->codes[code].first;
                 }
-                else if(code == avail)
+                else if (code == avail)
                     return stbi__errpuc("illegal code in raster", "Corrupt GIF");
 
-                stbi__out_gif_code(g, ( stbi__uint16 )code);
+                stbi__out_gif_code(g, (stbi__uint16)code);
 
-                if((avail & codemask) == 0 && avail <= 0x0FFF)
+                if ((avail & codemask) == 0 && avail <= 0x0FFF)
                 {
                     codesize++;
                     codemask = (1 << codesize) - 1;
@@ -7214,22 +7209,22 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i
 
     // on first frame, any non-written pixels get the background colour (non-transparent)
     first_frame = 0;
-    if(g->out == 0)
-    {
-        if(!stbi__gif_header(s, g, comp, 0))
-            return 0;    // stbi__g_failure_reason set by stbi__gif_header
-        g->out = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h);
-        g->background = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h);
-        g->history = ( stbi_uc* )stbi__malloc((size_t)(g->w) * g->h);
-        if(g->out == 0)
+    if (g->out == 0)
+    {
+        if (!stbi__gif_header(s, g, comp, 0))
+            return 0; // stbi__g_failure_reason set by stbi__gif_header
+        g->out = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h);
+        g->background = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h);
+        g->history = (stbi_uc*)stbi__malloc((size_t)(g->w) * g->h);
+        if (g->out == 0)
             return stbi__errpuc("outofmem", "Out of memory");
 
         // image is treated as "tranparent" at the start - ie, nothing overwrites the current background;
         // background colour is only used for pixels that are not rendered first frame, after that "background"
         // color refers to teh color that was there the previous frame.
         memset(g->out, 0x00, 4 * (size_t)(g->w) * g->h);
-        memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h);    // state of the background (starts transparent)
-        memset(g->history, 0x00, (size_t)(g->w) * g->h);    // pixels that were affected previous frame
+        memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h); // state of the background (starts transparent)
+        memset(g->history, 0x00, (size_t)(g->w) * g->h);        // pixels that were affected previous frame
         first_frame = 1;
     }
     else
@@ -7238,27 +7233,27 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i
         dispose = (g->eflags & 0x1C) >> 2;
         pcount = g->w * g->h;
 
-        if((dispose == 3) && (two_back == 0))
+        if ((dispose == 3) && (two_back == 0))
         {
-            dispose = 2;    // if I don't have an image to revert back to, default to the old background
+            dispose = 2; // if I don't have an image to revert back to, default to the old background
         }
 
-        if(dispose == 3)
-        {    // use previous graphic
-            for(pi = 0; pi < pcount; ++pi)
+        if (dispose == 3)
+        { // use previous graphic
+            for (pi = 0; pi < pcount; ++pi)
             {
-                if(g->history[pi])
+                if (g->history[pi])
                 {
                     memcpy(&g->out[pi * 4], &two_back[pi * 4], 4);
                 }
             }
         }
-        else if(dispose == 2)
+        else if (dispose == 2)
         {
             // restore what was changed last frame to background before that frame;
-            for(pi = 0; pi < pcount; ++pi)
+            for (pi = 0; pi < pcount; ++pi)
             {
-                if(g->history[pi])
+                if (g->history[pi])
                 {
                     memcpy(&g->out[pi * 4], &g->background[pi * 4], 4);
                 }
@@ -7277,139 +7272,139 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i
     }
 
     // clear my history;
-    memset(g->history, 0x00, (size_t)(g->w) * g->h);    // pixels that were affected previous frame
+    memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame
 
-    for(;;)
+    for (;;)
     {
         int tag = stbi__get8(s);
-        switch(tag)
+        switch (tag)
+        {
+        case 0x2C: /* Image Descriptor */
         {
-            case 0x2C: /* Image Descriptor */
+            stbi__int32 x, y, w, h;
+            stbi_uc* o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+                return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x = g->start_x + w * 4;
+            g->max_y = g->start_y + h * g->line_size;
+            g->cur_x = g->start_x;
+            g->cur_y = g->start_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40)
             {
-                stbi__int32 x, y, w, h;
-                stbi_uc* o;
-
-                x = stbi__get16le(s);
-                y = stbi__get16le(s);
-                w = stbi__get16le(s);
-                h = stbi__get16le(s);
-                if(((x + w) > (g->w)) || ((y + h) > (g->h)))
-                    return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
-
-                g->line_size = g->w * 4;
-                g->start_x = x * 4;
-                g->start_y = y * g->line_size;
-                g->max_x = g->start_x + w * 4;
-                g->max_y = g->start_y + h * g->line_size;
-                g->cur_x = g->start_x;
-                g->cur_y = g->start_y;
-
-                g->lflags = stbi__get8(s);
-
-                if(g->lflags & 0x40)
-                {
-                    g->step = 8 * g->line_size;    // first interlaced spacing
-                    g->parse = 3;
-                }
-                else
-                {
-                    g->step = g->line_size;
-                    g->parse = 0;
-                }
+                g->step = 8 * g->line_size; // first interlaced spacing
+                g->parse = 3;
+            }
+            else
+            {
+                g->step = g->line_size;
+                g->parse = 0;
+            }
 
-                if(g->lflags & 0x80)
-                {
-                    stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7),
-                                               g->eflags & 0x01 ? g->transparent : -1);
-                    g->color_table = ( stbi_uc* )g->lpal;
-                }
-                else if(g->flags & 0x80)
-                {
-                    g->color_table = ( stbi_uc* )g->pal;
-                }
-                else
-                    return stbi__errpuc("missing color table", "Corrupt GIF");
+            if (g->lflags & 0x80)
+            {
+                stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7),
+                                           g->eflags & 0x01 ? g->transparent : -1);
+                g->color_table = (stbi_uc*)g->lpal;
+            }
+            else if (g->flags & 0x80)
+            {
+                g->color_table = (stbi_uc*)g->pal;
+            }
+            else
+                return stbi__errpuc("missing color table", "Corrupt GIF");
 
-                o = stbi__process_gif_raster(s, g);
-                if(o == NULL)
-                    return NULL;
+            o = stbi__process_gif_raster(s, g);
+            if (o == NULL)
+                return NULL;
 
-                // if this was the first frame,
-                pcount = g->w * g->h;
-                if(first_frame && (g->bgindex > 0))
+            // if this was the first frame,
+            pcount = g->w * g->h;
+            if (first_frame && (g->bgindex > 0))
+            {
+                // if first frame, any pixel not drawn to gets the background color
+                for (pi = 0; pi < pcount; ++pi)
                 {
-                    // if first frame, any pixel not drawn to gets the background color
-                    for(pi = 0; pi < pcount; ++pi)
+                    if (g->history[pi] == 0)
                     {
-                        if(g->history[pi] == 0)
-                        {
-                            g->pal[g->bgindex][3] = 255;    // just in case it was made transparent, undo that; It will
-                                                            // be reset next frame if need be;
-                            memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
-                        }
+                        g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will
+                                                     // be reset next frame if need be;
+                        memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
                     }
                 }
-
-                return o;
             }
 
-            case 0x21:    // Comment Extension.
-            {
-                int len;
-                int ext = stbi__get8(s);
-                if(ext == 0xF9)
-                {    // Graphic Control Extension.
-                    len = stbi__get8(s);
-                    if(len == 4)
-                    {
-                        g->eflags = stbi__get8(s);
-                        g->delay = 10 * stbi__get16le(s);    // delay - 1/100th of a second, saving as 1/1000ths.
+            return o;
+        }
 
-                        // unset old transparent
-                        if(g->transparent >= 0)
-                        {
-                            g->pal[g->transparent][3] = 255;
-                        }
-                        if(g->eflags & 0x01)
-                        {
-                            g->transparent = stbi__get8(s);
-                            if(g->transparent >= 0)
-                            {
-                                g->pal[g->transparent][3] = 0;
-                            }
-                        }
-                        else
+        case 0x21: // Comment Extension.
+        {
+            int len;
+            int ext = stbi__get8(s);
+            if (ext == 0xF9)
+            { // Graphic Control Extension.
+                len = stbi__get8(s);
+                if (len == 4)
+                {
+                    g->eflags = stbi__get8(s);
+                    g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                    // unset old transparent
+                    if (g->transparent >= 0)
+                    {
+                        g->pal[g->transparent][3] = 255;
+                    }
+                    if (g->eflags & 0x01)
+                    {
+                        g->transparent = stbi__get8(s);
+                        if (g->transparent >= 0)
                         {
-                            // don't need transparent
-                            stbi__skip(s, 1);
-                            g->transparent = -1;
+                            g->pal[g->transparent][3] = 0;
                         }
                     }
                     else
                     {
-                        stbi__skip(s, len);
-                        break;
+                        // don't need transparent
+                        stbi__skip(s, 1);
+                        g->transparent = -1;
                     }
                 }
-                while((len = stbi__get8(s)) != 0)
+                else
                 {
                     stbi__skip(s, len);
+                    break;
                 }
-                break;
             }
+            while ((len = stbi__get8(s)) != 0)
+            {
+                stbi__skip(s, len);
+            }
+            break;
+        }
 
-            case 0x3B:    // gif stream termination code
-                return ( stbi_uc* )s;    // using '1' causes warning on some compilers
+        case 0x3B:              // gif stream termination code
+            return (stbi_uc*)s; // using '1' causes warning on some compilers
 
-            default:
-                return stbi__errpuc("unknown code", "Corrupt GIF");
+        default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
         }
     }
 }
 
 static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, int* z, int* comp, int req_comp)
 {
-    if(stbi__gif_test(s))
+    if (stbi__gif_test(s))
     {
         int layers = 0;
         stbi_uc* u = 0;
@@ -7418,7 +7413,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y,
         stbi__gif g;
         int stride;
         memset(&g, 0, sizeof(g));
-        if(delays)
+        if (delays)
         {
             *delays = 0;
         }
@@ -7426,44 +7421,44 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y,
         do
         {
             u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
-            if(u == ( stbi_uc* )s)
-                u = 0;    // end of animated gif marker
+            if (u == (stbi_uc*)s)
+                u = 0; // end of animated gif marker
 
-            if(u)
+            if (u)
             {
                 *x = g.w;
                 *y = g.h;
                 ++layers;
                 stride = g.w * g.h * 4;
 
-                if(out)
+                if (out)
                 {
-                    out = ( stbi_uc* )STBI_REALLOC(out, (size_t)layers * stride);
-                    if(delays)
+                    out = (stbi_uc*)STBI_REALLOC(out, (size_t)layers * stride);
+                    if (delays)
                     {
-                        *delays = ( int* )STBI_REALLOC(*delays, sizeof(int) * layers);
+                        *delays = (int*)STBI_REALLOC(*delays, sizeof(int) * layers);
                     }
                 }
                 else
                 {
-                    out = ( stbi_uc* )stbi__malloc((size_t)layers * stride);
-                    if(delays)
+                    out = (stbi_uc*)stbi__malloc((size_t)layers * stride);
+                    if (delays)
                     {
-                        *delays = ( int* )stbi__malloc(layers * sizeof(int));
+                        *delays = (int*)stbi__malloc(layers * sizeof(int));
                     }
                 }
                 memcpy(out + ((layers - 1) * stride), u, stride);
-                if(layers >= 2)
+                if (layers >= 2)
                 {
                     two_back = out - 2 * stride;
                 }
 
-                if(delays)
+                if (delays)
                 {
                     (*delays)[layers - 1U] = g.delay;
                 }
             }
-        } while(u != 0);
+        } while (u != 0);
 
         // free temp buffer;
         STBI_FREE(g.out);
@@ -7471,7 +7466,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y,
         STBI_FREE(g.background);
 
         // do the final conversion after loading everything;
-        if(req_comp && req_comp != 4)
+        if (req_comp && req_comp != 4)
             out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
 
         *z = layers;
@@ -7490,16 +7485,16 @@ static void* stbi__gif_load(stbi__context* s, int* x, int* y, int* comp, int req
     memset(&g, 0, sizeof(g));
 
     u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
-    if(u == ( stbi_uc* )s)
-        u = 0;    // end of animated gif marker
-    if(u)
+    if (u == (stbi_uc*)s)
+        u = 0; // end of animated gif marker
+    if (u)
     {
         *x = g.w;
         *y = g.h;
 
         // moved conversion to after successful load so that the same
         // can be done for multiple frames.
-        if(req_comp && req_comp != 4)
+        if (req_comp && req_comp != 4)
             u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
     }
 
@@ -7523,8 +7518,8 @@ static int stbi__gif_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__hdr_test_core(stbi__context* s, const char* signature)
 {
     int i;
-    for(i = 0; signature[i]; ++i)
-        if(stbi__get8(s) != signature[i])
+    for (i = 0; signature[i]; ++i)
+        if (stbi__get8(s) != signature[i])
             return 0;
     stbi__rewind(s);
     return 1;
@@ -7534,7 +7529,7 @@ static int stbi__hdr_test(stbi__context* s)
 {
     int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
     stbi__rewind(s);
-    if(!r)
+    if (!r)
     {
         r = stbi__hdr_test_core(s, "#?RGBE\n");
         stbi__rewind(s);
@@ -7548,19 +7543,19 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer)
     int len = 0;
     char c = '\0';
 
-    c = ( char )stbi__get8(z);
+    c = (char)stbi__get8(z);
 
-    while(!stbi__at_eof(z) && c != '\n')
+    while (!stbi__at_eof(z) && c != '\n')
     {
         buffer[len++] = c;
-        if(len == STBI__HDR_BUFLEN - 1)
+        if (len == STBI__HDR_BUFLEN - 1)
         {
             // flush to end of line
-            while(!stbi__at_eof(z) && stbi__get8(z) != '\n')
+            while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
                 ;
             break;
         }
-        c = ( char )stbi__get8(z);
+        c = (char)stbi__get8(z);
     }
 
     buffer[len] = 0;
@@ -7569,12 +7564,12 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer)
 
 static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp)
 {
-    if(input[3] != 0)
+    if (input[3] != 0)
     {
         float f1;
         // Exponent
-        f1 = ( float )ldexp(1.0f, input[3] - ( int )(128 + 8));
-        if(req_comp <= 2)
+        f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8));
+        if (req_comp <= 2)
             output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
         else
         {
@@ -7582,25 +7577,25 @@ static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp)
             output[1] = input[1] * f1;
             output[2] = input[2] * f1;
         }
-        if(req_comp == 2)
+        if (req_comp == 2)
             output[1] = 1;
-        if(req_comp == 4)
+        if (req_comp == 4)
             output[3] = 1;
     }
     else
     {
-        switch(req_comp)
+        switch (req_comp)
         {
-            case 4:
-                output[3] = 1; /* fallthrough */
-            case 3:
-                output[0] = output[1] = output[2] = 0;
-                break;
-            case 2:
-                output[1] = 1; /* fallthrough */
-            case 1:
-                output[0] = 0;
-                break;
+        case 4:
+            output[3] = 1; /* fallthrough */
+        case 3:
+            output[0] = output[1] = output[2] = 0;
+            break;
+        case 2:
+            output[1] = 1; /* fallthrough */
+        case 1:
+            output[0] = 0;
+            break;
         }
     }
 }
@@ -7621,63 +7616,63 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re
 
     // Check identifier
     headerToken = stbi__hdr_gettoken(s, buffer);
-    if(strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
+    if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
         return stbi__errpf("not HDR", "Corrupt HDR image");
 
     // Parse header
-    for(;;)
+    for (;;)
     {
         token = stbi__hdr_gettoken(s, buffer);
-        if(token[0] == 0)
+        if (token[0] == 0)
             break;
-        if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
+        if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
             valid = 1;
     }
 
-    if(!valid)
+    if (!valid)
         return stbi__errpf("unsupported format", "Unsupported HDR format");
 
     // Parse width and height
     // can't use sscanf() if we're not using stdio!
     token = stbi__hdr_gettoken(s, buffer);
-    if(strncmp(token, "-Y ", 3))
+    if (strncmp(token, "-Y ", 3))
         return stbi__errpf("unsupported data layout", "Unsupported HDR format");
     token += 3;
-    height = ( int )strtol(token, &token, 10);
-    while(*token == ' ')
+    height = (int)strtol(token, &token, 10);
+    while (*token == ' ')
         ++token;
-    if(strncmp(token, "+X ", 3))
+    if (strncmp(token, "+X ", 3))
         return stbi__errpf("unsupported data layout", "Unsupported HDR format");
     token += 3;
-    width = ( int )strtol(token, NULL, 10);
+    width = (int)strtol(token, NULL, 10);
 
     *x = width;
     *y = height;
 
-    if(comp)
+    if (comp)
         *comp = 3;
-    if(req_comp == 0)
+    if (req_comp == 0)
         req_comp = 3;
 
-    if(!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+    if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
         return stbi__errpf("too large", "HDR image is too large");
 
     // Read data
-    hdr_data = ( float* )stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
-    if(!hdr_data)
+    hdr_data = (float*)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+    if (!hdr_data)
         return stbi__errpf("outofmem", "Out of memory");
 
     // Load image data
     // image data is stored as some number of sca
-    if(width < 8 || width >= 32768)
+    if (width < 8 || width >= 32768)
     {
         // Read flat data
-        for(j = 0; j < height; ++j)
+        for (j = 0; j < height; ++j)
         {
-            for(i = 0; i < width; ++i)
+            for (i = 0; i < width; ++i)
             {
                 stbi_uc rgbe[4];
-            main_decode_loop:
+main_decode_loop:
                 stbi__getn(s, rgbe, 4);
                 stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
             }
@@ -7688,83 +7683,83 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re
         // Read RLE-encoded data
         scanline = NULL;
 
-        for(j = 0; j < height; ++j)
+        for (j = 0; j < height; ++j)
         {
             c1 = stbi__get8(s);
             c2 = stbi__get8(s);
             len = stbi__get8(s);
-            if(c1 != 2 || c2 != 2 || (len & 0x80))
+            if (c1 != 2 || c2 != 2 || (len & 0x80))
             {
                 // not run-length encoded, so we have to actually use THIS data as a decoded
                 // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
                 stbi_uc rgbe[4];
-                rgbe[0] = ( stbi_uc )c1;
-                rgbe[1] = ( stbi_uc )c2;
-                rgbe[2] = ( stbi_uc )len;
-                rgbe[3] = ( stbi_uc )stbi__get8(s);
+                rgbe[0] = (stbi_uc)c1;
+                rgbe[1] = (stbi_uc)c2;
+                rgbe[2] = (stbi_uc)len;
+                rgbe[3] = (stbi_uc)stbi__get8(s);
                 stbi__hdr_convert(hdr_data, rgbe, req_comp);
                 i = 1;
                 j = 0;
                 STBI_FREE(scanline);
-                goto main_decode_loop;    // yes, this makes no sense
+                goto main_decode_loop; // yes, this makes no sense
             }
             len <<= 8;
             len |= stbi__get8(s);
-            if(len != width)
+            if (len != width)
             {
                 STBI_FREE(hdr_data);
                 STBI_FREE(scanline);
                 return stbi__errpf("invalid decoded scanline length", "corrupt HDR");
             }
-            if(scanline == NULL)
+            if (scanline == NULL)
             {
-                scanline = ( stbi_uc* )stbi__malloc_mad2(width, 4, 0);
-                if(!scanline)
+                scanline = (stbi_uc*)stbi__malloc_mad2(width, 4, 0);
+                if (!scanline)
                 {
                     STBI_FREE(hdr_data);
                     return stbi__errpf("outofmem", "Out of memory");
                 }
             }
 
-            for(k = 0; k < 4; ++k)
+            for (k = 0; k < 4; ++k)
             {
                 int nleft;
                 i = 0;
-                while((nleft = width - i) > 0)
+                while ((nleft = width - i) > 0)
                 {
                     count = stbi__get8(s);
-                    if(count > 128)
+                    if (count > 128)
                     {
                         // Run
                         value = stbi__get8(s);
                         count -= 128;
-                        if(count > nleft)
+                        if (count > nleft)
                         {
                             STBI_FREE(hdr_data);
                             STBI_FREE(scanline);
                             return stbi__errpf("corrupt", "bad RLE data in HDR");
                         }
-                        for(z = 0; z < count; ++z)
+                        for (z = 0; z < count; ++z)
                             scanline[i++ * 4 + k] = value;
                     }
                     else
                     {
                         // Dump
-                        if(count > nleft)
+                        if (count > nleft)
                         {
                             STBI_FREE(hdr_data);
                             STBI_FREE(scanline);
                             return stbi__errpf("corrupt", "bad RLE data in HDR");
                         }
-                        for(z = 0; z < count; ++z)
+                        for (z = 0; z < count; ++z)
                             scanline[i++ * 4 + k] = stbi__get8(s);
                     }
                 }
             }
-            for(i = 0; i < width; ++i)
+            for (i = 0; i < width; ++i)
                 stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, scanline + i * 4, req_comp);
         }
-        if(scanline)
+        if (scanline)
             STBI_FREE(scanline);
     }
 
@@ -7778,54 +7773,54 @@ static int stbi__hdr_info(stbi__context* s, int* x, int* y, int* comp)
     int valid = 0;
     int dummy;
 
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
 
-    if(stbi__hdr_test(s) == 0)
+    if (stbi__hdr_test(s) == 0)
     {
         stbi__rewind(s);
         return 0;
     }
 
-    for(;;)
+    for (;;)
     {
         token = stbi__hdr_gettoken(s, buffer);
-        if(token[0] == 0)
+        if (token[0] == 0)
             break;
-        if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
+        if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
             valid = 1;
     }
 
-    if(!valid)
+    if (!valid)
     {
         stbi__rewind(s);
         return 0;
     }
     token = stbi__hdr_gettoken(s, buffer);
-    if(strncmp(token, "-Y ", 3))
+    if (strncmp(token, "-Y ", 3))
     {
         stbi__rewind(s);
         return 0;
     }
     token += 3;
-    *y = ( int )strtol(token, &token, 10);
-    while(*token == ' ')
+    *y = (int)strtol(token, &token, 10);
+    while (*token == ' ')
         ++token;
-    if(strncmp(token, "+X ", 3))
+    if (strncmp(token, "+X ", 3))
     {
         stbi__rewind(s);
         return 0;
     }
     token += 3;
-    *x = ( int )strtol(token, NULL, 10);
+    *x = (int)strtol(token, NULL, 10);
     *comp = 3;
     return 1;
 }
-#endif    // STBI_NO_HDR
+#endif // STBI_NO_HDR
 
 #ifndef STBI_NO_BMP
 static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp)
@@ -7836,13 +7831,13 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp)
     info.all_a = 255;
     p = stbi__bmp_parse_header(s, &info);
     stbi__rewind(s);
-    if(p == NULL)
+    if (p == NULL)
         return 0;
-    if(x)
+    if (x)
         *x = s->img_x;
-    if(y)
+    if (y)
         *y = s->img_y;
-    if(comp)
+    if (comp)
         *comp = info.ma ? 4 : 3;
     return 1;
 }
@@ -7852,25 +7847,25 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp)
 {
     int channelCount, dummy, depth;
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
-    if(stbi__get32be(s) != 0x38425053)
+    if (stbi__get32be(s) != 0x38425053)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(stbi__get16be(s) != 1)
+    if (stbi__get16be(s) != 1)
     {
         stbi__rewind(s);
         return 0;
     }
     stbi__skip(s, 6);
     channelCount = stbi__get16be(s);
-    if(channelCount < 0 || channelCount > 16)
+    if (channelCount < 0 || channelCount > 16)
     {
         stbi__rewind(s);
         return 0;
@@ -7878,12 +7873,12 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp)
     *y = stbi__get32be(s);
     *x = stbi__get32be(s);
     depth = stbi__get16be(s);
-    if(depth != 8 && depth != 16)
+    if (depth != 8 && depth != 16)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(stbi__get16be(s) != 3)
+    if (stbi__get16be(s) != 3)
     {
         stbi__rewind(s);
         return 0;
@@ -7895,27 +7890,27 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__psd_is16(stbi__context* s)
 {
     int channelCount, depth;
-    if(stbi__get32be(s) != 0x38425053)
+    if (stbi__get32be(s) != 0x38425053)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(stbi__get16be(s) != 1)
+    if (stbi__get16be(s) != 1)
     {
         stbi__rewind(s);
         return 0;
     }
     stbi__skip(s, 6);
     channelCount = stbi__get16be(s);
-    if(channelCount < 0 || channelCount > 16)
+    if (channelCount < 0 || channelCount > 16)
     {
         stbi__rewind(s);
         return 0;
     }
-    ( void )stbi__get32be(s);
-    ( void )stbi__get32be(s);
+    (void)stbi__get32be(s);
+    (void)stbi__get32be(s);
     depth = stbi__get16be(s);
-    if(depth != 16)
+    if (depth != 16)
     {
         stbi__rewind(s);
         return 0;
@@ -7930,14 +7925,14 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
     int act_comp = 0, num_packets = 0, chained, dummy;
     stbi__pic_packet packets[10];
 
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
 
-    if(!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
+    if (!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
     {
         stbi__rewind(s);
         return 0;
@@ -7947,12 +7942,12 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
 
     *x = stbi__get16be(s);
     *y = stbi__get16be(s);
-    if(stbi__at_eof(s))
+    if (stbi__at_eof(s))
     {
         stbi__rewind(s);
         return 0;
     }
-    if((*x) != 0 && (1 << 28) / (*x) < (*y))
+    if ((*x) != 0 && (1 << 28) / (*x) < (*y))
     {
         stbi__rewind(s);
         return 0;
@@ -7964,7 +7959,7 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
     {
         stbi__pic_packet* packet;
 
-        if(num_packets == sizeof(packets) / sizeof(packets[0]))
+        if (num_packets == sizeof(packets) / sizeof(packets[0]))
             return 0;
 
         packet = &packets[num_packets++];
@@ -7974,17 +7969,17 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
         packet->channel = stbi__get8(s);
         act_comp |= packet->channel;
 
-        if(stbi__at_eof(s))
+        if (stbi__at_eof(s))
         {
             stbi__rewind(s);
             return 0;
         }
-        if(packet->size != 8)
+        if (packet->size != 8)
         {
             stbi__rewind(s);
             return 0;
         }
-    } while(chained);
+    } while (chained);
 
     *comp = (act_comp & 0x10 ? 4 : 3);
 
@@ -8009,9 +8004,9 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__pnm_test(stbi__context* s)
 {
     char p, t;
-    p = ( char )stbi__get8(s);
-    t = ( char )stbi__get8(s);
-    if(p != 'P' || (t != '5' && t != '6'))
+    p = (char)stbi__get8(s);
+    t = (char)stbi__get8(s);
+    if (p != 'P' || (t != '5' && t != '6'))
     {
         stbi__rewind(s);
         return 0;
@@ -8024,27 +8019,27 @@ static void* stbi__pnm_load(stbi__context* s, int* x, int* y, int* comp, int req
     stbi_uc* out;
     STBI_NOTUSED(ri);
 
-    if(!stbi__pnm_info(s, ( int* )&s->img_x, ( int* )&s->img_y, ( int* )&s->img_n))
+    if (!stbi__pnm_info(s, (int*)&s->img_x, (int*)&s->img_y, (int*)&s->img_n))
         return 0;
 
     *x = s->img_x;
     *y = s->img_y;
-    if(comp)
+    if (comp)
         *comp = s->img_n;
 
-    if(!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+    if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
         return stbi__errpuc("too large", "PNM too large");
 
-    out = ( stbi_uc* )stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
-    if(!out)
+    out = (stbi_uc*)stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
+    if (!out)
         return stbi__errpuc("outofmem", "Out of memory");
     stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
 
-    if(req_comp && req_comp != s->img_n)
+    if (req_comp && req_comp != s->img_n)
     {
         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
-        if(out == NULL)
-            return out;    // stbi__convert_format frees input on failure
+        if (out == NULL)
+            return out; // stbi__convert_format frees input on failure
     }
     return out;
 }
@@ -8056,16 +8051,16 @@ static int stbi__pnm_isspace(char c)
 
 static void stbi__pnm_skip_whitespace(stbi__context* s, char* c)
 {
-    for(;;)
+    for (;;)
     {
-        while(!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-            *c = ( char )stbi__get8(s);
+        while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+            *c = (char)stbi__get8(s);
 
-        if(stbi__at_eof(s) || *c != '#')
+        if (stbi__at_eof(s) || *c != '#')
             break;
 
-        while(!stbi__at_eof(s) && *c != '\n' && *c != '\r')
-            *c = ( char )stbi__get8(s);
+        while (!stbi__at_eof(s) && *c != '\n' && *c != '\r')
+            *c = (char)stbi__get8(s);
     }
 }
 
@@ -8078,10 +8073,10 @@ static int stbi__pnm_getinteger(stbi__context* s, char* c)
 {
     int value = 0;
 
-    while(!stbi__at_eof(s) && stbi__pnm_isdigit(*c))
+    while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c))
     {
         value = value * 10 + (*c - '0');
-        *c = ( char )stbi__get8(s);
+        *c = (char)stbi__get8(s);
     }
 
     return value;
@@ -8092,38 +8087,38 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp)
     int maxv, dummy;
     char c, p, t;
 
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
 
     stbi__rewind(s);
 
     // Get identifier
-    p = ( char )stbi__get8(s);
-    t = ( char )stbi__get8(s);
-    if(p != 'P' || (t != '5' && t != '6'))
+    p = (char)stbi__get8(s);
+    t = (char)stbi__get8(s);
+    if (p != 'P' || (t != '5' && t != '6'))
     {
         stbi__rewind(s);
         return 0;
     }
 
-    *comp = (t == '6') ? 3 : 1;    // '5' is 1-component .pgm; '6' is 3-component .ppm
+    *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm
 
-    c = ( char )stbi__get8(s);
+    c = (char)stbi__get8(s);
     stbi__pnm_skip_whitespace(s, &c);
 
-    *x = stbi__pnm_getinteger(s, &c);    // read width
+    *x = stbi__pnm_getinteger(s, &c); // read width
     stbi__pnm_skip_whitespace(s, &c);
 
-    *y = stbi__pnm_getinteger(s, &c);    // read height
+    *y = stbi__pnm_getinteger(s, &c); // read height
     stbi__pnm_skip_whitespace(s, &c);
 
-    maxv = stbi__pnm_getinteger(s, &c);    // read max value
+    maxv = stbi__pnm_getinteger(s, &c); // read max value
 
-    if(maxv > 255)
+    if (maxv > 255)
         return stbi__err("max value > 255", "PPM image not 8-bit");
     else
         return 1;
@@ -8133,48 +8128,48 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp)
 {
 #ifndef STBI_NO_JPEG
-    if(stbi__jpeg_info(s, x, y, comp))
+    if (stbi__jpeg_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PNG
-    if(stbi__png_info(s, x, y, comp))
+    if (stbi__png_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_GIF
-    if(stbi__gif_info(s, x, y, comp))
+    if (stbi__gif_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_BMP
-    if(stbi__bmp_info(s, x, y, comp))
+    if (stbi__bmp_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PSD
-    if(stbi__psd_info(s, x, y, comp))
+    if (stbi__psd_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PIC
-    if(stbi__pic_info(s, x, y, comp))
+    if (stbi__pic_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PNM
-    if(stbi__pnm_info(s, x, y, comp))
+    if (stbi__pnm_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_HDR
-    if(stbi__hdr_info(s, x, y, comp))
+    if (stbi__hdr_info(s, x, y, comp))
         return 1;
 #endif
 
 // test tga last because it's a crappy test!
 #ifndef STBI_NO_TGA
-    if(stbi__tga_info(s, x, y, comp))
+    if (stbi__tga_info(s, x, y, comp))
         return 1;
 #endif
     return stbi__err("unknown image type", "Image not of any known type, or corrupt");
@@ -8183,12 +8178,12 @@ static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__is_16_main(stbi__context* s)
 {
 #ifndef STBI_NO_PNG
-    if(stbi__png_is16(s))
+    if (stbi__png_is16(s))
         return 1;
 #endif
 
 #ifndef STBI_NO_PSD
-    if(stbi__psd_is16(s))
+    if (stbi__psd_is16(s))
         return 1;
 #endif
 
@@ -8200,7 +8195,7 @@ extern int stbi_info(char const* filename, int* x, int* y, int* comp)
 {
     FILE* f = stbi__fopen(filename, "rb");
     int result;
-    if(!f)
+    if (!f)
         return stbi__err("can't fopen", "Unable to open file");
     result = stbi_info_from_file(f, x, y, comp);
     fclose(f);
@@ -8222,7 +8217,7 @@ extern int stbi_is_16_bit(char const* filename)
 {
     FILE* f = stbi__fopen(filename, "rb");
     int result;
-    if(!f)
+    if (!f)
         return stbi__err("can't fopen", "Unable to open file");
     result = stbi_is_16_bit_from_file(f);
     fclose(f);
@@ -8239,7 +8234,7 @@ extern int stbi_is_16_bit_from_file(FILE* f)
     fseek(f, pos, SEEK_SET);
     return r;
 }
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
 extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* comp)
 {
@@ -8251,7 +8246,7 @@ extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y,
 extern int stbi_info_from_callbacks(stbi_io_callbacks const* c, void* user, int* x, int* y, int* comp)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user);
     return stbi__info_main(&s, x, y, comp);
 }
 
@@ -8265,11 +8260,11 @@ extern int stbi_is_16_bit_from_memory(stbi_uc const* buffer, int len)
 extern int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const* c, void* user)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user);
     return stbi__is_16_main(&s);
 }
 
-#endif    // STB_IMAGE_IMPLEMENTATION
+#endif // STB_IMAGE_IMPLEMENTATION
 
 /*
    revision history:
diff --git a/examples/common/stb_image_write.h b/examples/common/stb_image_write.h
index 42b7c1796..fe585cf94 100644
--- a/examples/common/stb_image_write.h
+++ b/examples/common/stb_image_write.h
@@ -14,7 +14,7 @@
 #endif
 #endif
 
-#ifndef STB_IMAGE_WRITE_STATIC    // C++ forbids static forward declarations
+#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
 extern int stbi_write_tga_with_rle;
 extern int stbi_write_png_compression_level;
 extern int stbi_write_force_png_filter;
@@ -40,7 +40,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x,
 
 STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
-#endif    // INCLUDE_STB_IMAGE_WRITE_H
+#endif // INCLUDE_STB_IMAGE_WRITE_H
 
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
@@ -56,7 +56,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
 #ifndef STBI_WRITE_NO_STDIO
 #include <stdio.h>
-#endif    // STBI_WRITE_NO_STDIO
+#endif // STBI_WRITE_NO_STDIO
 
 #include <stdarg.h>
 #include <stdlib.h>
@@ -72,9 +72,9 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 #endif
 
 #ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz) malloc(sz)
+#define STBIW_MALLOC(sz)        malloc(sz)
 #define STBIW_REALLOC(p, newsz) realloc(p, newsz)
-#define STBIW_FREE(p) free(p)
+#define STBIW_FREE(p)           free(p)
 #endif
 
 #ifndef STBIW_REALLOC_SIZED
@@ -90,7 +90,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 #define STBIW_ASSERT(x) assert(x)
 #endif
 
-#define STBIW_UCHAR(x) ( unsigned char )(( x )&0xff)
+#define STBIW_UCHAR(x) (unsigned char)((x)&0xff)
 
 #ifdef STB_IMAGE_WRITE_STATIC
 static int stbi__flip_vertically_on_write = 0;
@@ -126,69 +126,69 @@ static void stbi__start_write_callbacks(stbi__write_context* s, stbi_write_func*
 
 static void stbi__stdio_write(void* context, void* data, int size)
 {
-    fwrite(data, 1, size, ( FILE* )context);
+    fwrite(data, 1, size, (FILE*)context);
 }
 
 static int stbi__start_write_file(stbi__write_context* s, const char* filename)
 {
     FILE* f;
 #ifdef STBI_MSC_SECURE_CRT
-    if(fopen_s(&f, filename, "wb"))
+    if (fopen_s(&f, filename, "wb"))
         f = NULL;
 #else
     f = fopen(filename, "wb");
 #endif
-    stbi__start_write_callbacks(s, stbi__stdio_write, ( void* )f);
+    stbi__start_write_callbacks(s, stbi__stdio_write, (void*)f);
     return f != NULL;
 }
 
 static void stbi__end_write_file(stbi__write_context* s)
 {
-    fclose(( FILE* )s->context);
+    fclose((FILE*)s->context);
 }
 
-#endif    // !STBI_WRITE_NO_STDIO
+#endif // !STBI_WRITE_NO_STDIO
 
 typedef unsigned int stbiw_uint32;
 typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1];
 
 static void stbiw__writefv(stbi__write_context* s, const char* fmt, va_list v)
 {
-    while(*fmt)
+    while (*fmt)
     {
-        switch(*fmt++)
+        switch (*fmt++)
         {
-            case ' ':
-                break;
-            case '1':
-            {
-                unsigned char x = STBIW_UCHAR(va_arg(v, int));
-                s->func(s->context, &x, 1);
-                break;
-            }
-            case '2':
-            {
-                int x = va_arg(v, int);
-                unsigned char b[2];
-                b[0] = STBIW_UCHAR(x);
-                b[1] = STBIW_UCHAR(x >> 8);
-                s->func(s->context, b, 2);
-                break;
-            }
-            case '4':
-            {
-                stbiw_uint32 x = va_arg(v, int);
-                unsigned char b[4];
-                b[0] = STBIW_UCHAR(x);
-                b[1] = STBIW_UCHAR(x >> 8);
-                b[2] = STBIW_UCHAR(x >> 16);
-                b[3] = STBIW_UCHAR(x >> 24);
-                s->func(s->context, b, 4);
-                break;
-            }
-            default:
-                STBIW_ASSERT(0);
-                return;
+        case ' ':
+            break;
+        case '1':
+        {
+            unsigned char x = STBIW_UCHAR(va_arg(v, int));
+            s->func(s->context, &x, 1);
+            break;
+        }
+        case '2':
+        {
+            int x = va_arg(v, int);
+            unsigned char b[2];
+            b[0] = STBIW_UCHAR(x);
+            b[1] = STBIW_UCHAR(x >> 8);
+            s->func(s->context, b, 2);
+            break;
+        }
+        case '4':
+        {
+            stbiw_uint32 x = va_arg(v, int);
+            unsigned char b[4];
+            b[0] = STBIW_UCHAR(x);
+            b[1] = STBIW_UCHAR(x >> 8);
+            b[2] = STBIW_UCHAR(x >> 16);
+            b[3] = STBIW_UCHAR(x >> 24);
+            s->func(s->context, b, 4);
+            break;
+        }
+        default:
+            STBIW_ASSERT(0);
+            return;
         }
     }
 }
@@ -219,33 +219,33 @@ static void stbiw__write_pixel(stbi__write_context* s, int rgb_dir, int comp, in
     unsigned char bg[3] = {255, 0, 255}, px[3];
     int k;
 
-    if(write_alpha < 0)
+    if (write_alpha < 0)
         s->func(s->context, &d[comp - 1], 1);
 
-    switch(comp)
+    switch (comp)
     {
-        case 2:    // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
-        case 1:
-            if(expand_mono)
-                stbiw__write3(s, d[0], d[0], d[0]);    // monochrome bmp
-            else
-                s->func(s->context, d, 1);    // monochrome TGA
-            break;
-        case 4:
-            if(!write_alpha)
-            {
-                // composite against pink background
-                for(k = 0; k < 3; ++k)
-                    px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
-                stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
-                break;
-            }
-            /* FALLTHROUGH */
-        case 3:
-            stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
+    case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
+    case 1:
+        if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+        else
+            s->func(s->context, d, 1); // monochrome TGA
+        break;
+    case 4:
+        if (!write_alpha)
+        {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+                px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
             break;
+        }
+        /* FALLTHROUGH */
+    case 3:
+        stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
+        break;
     }
-    if(write_alpha > 0)
+    if (write_alpha > 0)
         s->func(s->context, &d[comp - 1], 1);
 }
 
@@ -255,22 +255,22 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i
     stbiw_uint32 zero = 0;
     int i, j, j_end;
 
-    if(y <= 0)
+    if (y <= 0)
         return;
 
-    if(stbi__flip_vertically_on_write)
+    if (stbi__flip_vertically_on_write)
         vdir *= -1;
 
-    if(vdir < 0)
+    if (vdir < 0)
         j_end = -1, j = y - 1;
     else
         j_end = y, j = 0;
 
-    for(; j != j_end; j += vdir)
+    for (; j != j_end; j += vdir)
     {
-        for(i = 0; i < x; ++i)
+        for (i = 0; i < x; ++i)
         {
-            unsigned char* d = ( unsigned char* )data + (j * x + i) * comp;
+            unsigned char* d = (unsigned char*)data + (j * x + i) * comp;
             stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
         }
         s->func(s->context, &zero, scanline_pad);
@@ -280,7 +280,7 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i
 static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono,
                           void* data, int alpha, int pad, const char* fmt, ...)
 {
-    if(y < 0 || x < 0)
+    if (y < 0 || x < 0)
     {
         return 0;
     }
@@ -298,11 +298,11 @@ static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x,
 static int stbi_write_bmp_core(stbi__write_context* s, int x, int y, int comp, const void* data)
 {
     int pad = (-x * 3) & 3;
-    return stbiw__outfile(s, -1, -1, x, y, comp, 1, ( void* )data, 0, pad,
+    return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void*)data, 0, pad,
                           "11 4 22 4"
                           "4 44 22 444444",
-                          'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40,    // file header
-                          40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);    // bitmap header
+                          'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40, // file header
+                          40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);                   // bitmap header
 }
 
 STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x, int y, int comp, const void* data)
@@ -316,7 +316,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x,
 STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const void* data)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
         int r = stbi_write_bmp_core(&s, x, y, comp, data);
         stbi__end_write_file(&s);
@@ -325,20 +325,20 @@ STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const
     else
         return 0;
 }
-#endif    //! STBI_WRITE_NO_STDIO
+#endif //! STBI_WRITE_NO_STDIO
 
 static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, void* data)
 {
     int has_alpha = (comp == 2 || comp == 4);
     int colorbytes = has_alpha ? comp - 1 : comp;
-    int format = colorbytes < 2 ? 3 : 2;    // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
+    int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
 
-    if(y < 0 || x < 0)
+    if (y < 0 || x < 0)
         return 0;
 
-    if(!stbi_write_tga_with_rle)
+    if (!stbi_write_tga_with_rle)
     {
-        return stbiw__outfile(s, -1, -1, x, y, comp, 0, ( void* )data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0,
+        return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void*)data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0,
                               0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
     }
     else
@@ -349,7 +349,7 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
         stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8,
                       has_alpha * 8);
 
-        if(stbi__flip_vertically_on_write)
+        if (stbi__flip_vertically_on_write)
         {
             j = 0;
             jend = y;
@@ -361,27 +361,27 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
             jend = -1;
             jdir = -1;
         }
-        for(; j != jend; j += jdir)
+        for (; j != jend; j += jdir)
         {
-            unsigned char* row = ( unsigned char* )data + j * x * comp;
+            unsigned char* row = (unsigned char*)data + j * x * comp;
             int len;
 
-            for(i = 0; i < x; i += len)
+            for (i = 0; i < x; i += len)
             {
                 unsigned char* begin = row + i * comp;
                 int diff = 1;
                 len = 1;
 
-                if(i < x - 1)
+                if (i < x - 1)
                 {
                     ++len;
                     diff = memcmp(begin, row + (i + 1) * comp, comp);
-                    if(diff)
+                    if (diff)
                     {
                         const unsigned char* prev = begin;
-                        for(k = i + 2; k < x && len < 128; ++k)
+                        for (k = i + 2; k < x && len < 128; ++k)
                         {
-                            if(memcmp(prev, row + k * comp, comp))
+                            if (memcmp(prev, row + k * comp, comp))
                             {
                                 prev += comp;
                                 ++len;
@@ -395,9 +395,9 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
                     }
                     else
                     {
-                        for(k = i + 2; k < x && len < 128; ++k)
+                        for (k = i + 2; k < x && len < 128; ++k)
                         {
-                            if(!memcmp(begin, row + k * comp, comp))
+                            if (!memcmp(begin, row + k * comp, comp))
                             {
                                 ++len;
                             }
@@ -409,11 +409,11 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
                     }
                 }
 
-                if(diff)
+                if (diff)
                 {
                     unsigned char header = STBIW_UCHAR(len - 1);
                     s->func(s->context, &header, 1);
-                    for(k = 0; k < len; ++k)
+                    for (k = 0; k < len; ++k)
                     {
                         stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
                     }
@@ -434,16 +434,16 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func* func, void* context, int x,
 {
     stbi__write_context s;
     stbi__start_write_callbacks(&s, func, context);
-    return stbi_write_tga_core(&s, x, y, comp, ( void* )data);
+    return stbi_write_tga_core(&s, x, y, comp, (void*)data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_tga(char const* filename, int x, int y, int comp, const void* data)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
-        int r = stbi_write_tga_core(&s, x, y, comp, ( void* )data);
+        int r = stbi_write_tga_core(&s, x, y, comp, (void*)data);
         stbi__end_write_file(&s);
         return r;
     }
@@ -463,18 +463,18 @@ void stbiw__linear_to_rgbe(unsigned char* rgbe, float* linear)
     int exponent;
     float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 
-    if(maxcomp < 1e-32f)
+    if (maxcomp < 1e-32f)
     {
         rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
     }
     else
     {
-        float normalize = ( float )frexp(maxcomp, &exponent) * 256.0f / maxcomp;
+        float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp;
 
-        rgbe[0] = ( unsigned char )(linear[0] * normalize);
-        rgbe[1] = ( unsigned char )(linear[1] * normalize);
-        rgbe[2] = ( unsigned char )(linear[2] * normalize);
-        rgbe[3] = ( unsigned char )(exponent + 128);
+        rgbe[0] = (unsigned char)(linear[0] * normalize);
+        rgbe[1] = (unsigned char)(linear[1] * normalize);
+        rgbe[2] = (unsigned char)(linear[2] * normalize);
+        rgbe[3] = (unsigned char)(exponent + 128);
     }
 }
 
@@ -489,7 +489,7 @@ void stbiw__write_run_data(stbi__write_context* s, int length, unsigned char dat
 void stbiw__write_dump_data(stbi__write_context* s, int length, unsigned char* data)
 {
     unsigned char lengthbyte = STBIW_UCHAR(length);
-    STBIW_ASSERT(length <= 128);    // inconsistent with spec but consistent with official code
+    STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
     s->func(s->context, &lengthbyte, 1);
     s->func(s->context, data, length);
 }
@@ -505,21 +505,21 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
     scanlineheader[3] = (width & 0x00ff);
 
     /* skip RLE for images too small or large */
-    if(width < 8 || width >= 32768)
+    if (width < 8 || width >= 32768)
     {
-        for(x = 0; x < width; x++)
+        for (x = 0; x < width; x++)
         {
-            switch(ncomp)
+            switch (ncomp)
             {
-                case 4: /* fallthrough */
-                case 3:
-                    linear[2] = scanline[x * ncomp + 2];
-                    linear[1] = scanline[x * ncomp + 1];
-                    linear[0] = scanline[x * ncomp + 0];
-                    break;
-                default:
-                    linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
-                    break;
+            case 4: /* fallthrough */
+            case 3:
+                linear[2] = scanline[x * ncomp + 2];
+                linear[1] = scanline[x * ncomp + 1];
+                linear[0] = scanline[x * ncomp + 0];
+                break;
+            default:
+                linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
+                break;
             }
             stbiw__linear_to_rgbe(rgbe, linear);
             s->func(s->context, rgbe, 4);
@@ -529,19 +529,19 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
     {
         int c, r;
         /* encode into scratch buffer */
-        for(x = 0; x < width; x++)
+        for (x = 0; x < width; x++)
         {
-            switch(ncomp)
+            switch (ncomp)
             {
-                case 4: /* fallthrough */
-                case 3:
-                    linear[2] = scanline[x * ncomp + 2];
-                    linear[1] = scanline[x * ncomp + 1];
-                    linear[0] = scanline[x * ncomp + 0];
-                    break;
-                default:
-                    linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
-                    break;
+            case 4: /* fallthrough */
+            case 3:
+                linear[2] = scanline[x * ncomp + 2];
+                linear[1] = scanline[x * ncomp + 1];
+                linear[0] = scanline[x * ncomp + 0];
+                break;
+            default:
+                linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
+                break;
             }
             stbiw__linear_to_rgbe(rgbe, linear);
             scratch[x + width * 0] = rgbe[0];
@@ -553,43 +553,43 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
         s->func(s->context, scanlineheader, 4);
 
         /* RLE each component separately */
-        for(c = 0; c < 4; c++)
+        for (c = 0; c < 4; c++)
         {
             unsigned char* comp = &scratch[width * c];
 
             x = 0;
-            while(x < width)
+            while (x < width)
             {
                 // find first run
                 r = x;
-                while(r + 2 < width)
+                while (r + 2 < width)
                 {
-                    if(comp[r] == comp[r + 1] && comp[r] == comp[r + 2])
+                    if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2])
                         break;
                     ++r;
                 }
-                if(r + 2 >= width)
+                if (r + 2 >= width)
                     r = width;
                 // dump up to first run
-                while(x < r)
+                while (x < r)
                 {
                     int len = r - x;
-                    if(len > 128)
+                    if (len > 128)
                         len = 128;
                     stbiw__write_dump_data(s, len, &comp[x]);
                     x += len;
                 }
                 // if there's a run, output it
-                if(r + 2 < width)
-                {    // same test as what we break out of in search loop, so only true if we break'd
+                if (r + 2 < width)
+                { // same test as what we break out of in search loop, so only true if we break'd
                     // find next byte after run
-                    while(r < width && comp[r] == comp[x])
+                    while (r < width && comp[r] == comp[x])
                         ++r;
                     // output run up to r
-                    while(x < r)
+                    while (x < r)
                     {
                         int len = r - x;
-                        if(len > 127)
+                        if (len > 127)
                             len = 127;
                         stbiw__write_run_data(s, len, comp[x]);
                         x += len;
@@ -602,12 +602,12 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
 
 static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, float* data)
 {
-    if(y <= 0 || x <= 0 || data == NULL)
+    if (y <= 0 || x <= 0 || data == NULL)
         return 0;
     else
     {
         // Each component is stored separately. Allocate scratch space for full output scanline.
-        unsigned char* scratch = ( unsigned char* )STBIW_MALLOC(x * 4);
+        unsigned char* scratch = (unsigned char*)STBIW_MALLOC(x * 4);
         int i, len;
         char buffer[128];
         char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
@@ -620,7 +620,7 @@ static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, f
 #endif
         s->func(s->context, buffer, len);
 
-        for(i = 0; i < y; i++)
+        for (i = 0; i < y; i++)
             stbiw__write_hdr_scanline(s, x, comp, scratch,
                                       data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i) * x);
         STBIW_FREE(scratch);
@@ -632,23 +632,23 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func* func, void* context, int x,
 {
     stbi__write_context s;
     stbi__start_write_callbacks(&s, func, context);
-    return stbi_write_hdr_core(&s, x, y, comp, ( float* )data);
+    return stbi_write_hdr_core(&s, x, y, comp, (float*)data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const float* data)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
-        int r = stbi_write_hdr_core(&s, x, y, comp, ( float* )data);
+        int r = stbi_write_hdr_core(&s, x, y, comp, (float*)data);
         stbi__end_write_file(&s);
         return r;
     }
     else
         return 0;
 }
-#endif    // STBI_WRITE_NO_STDIO
+#endif // STBI_WRITE_NO_STDIO
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -657,30 +657,29 @@ STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const
 
 #ifndef STBIW_ZLIB_COMPRESS
 // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
-#define stbiw__sbraw(a) (( int* )( a )-2)
-#define stbiw__sbm(a) stbiw__sbraw(a)[0]
-#define stbiw__sbn(a) stbiw__sbraw(a)[1]
+#define stbiw__sbraw(a) ((int*)(a)-2)
+#define stbiw__sbm(a)   stbiw__sbraw(a)[0]
+#define stbiw__sbn(a)   stbiw__sbraw(a)[1]
 
-#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a))
+#define stbiw__sbneedgrow(a, n)  ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a))
 #define stbiw__sbmaybegrow(a, n) (stbiw__sbneedgrow(a, (n)) ? stbiw__sbgrow(a, n) : 0)
-#define stbiw__sbgrow(a, n) stbiw__sbgrowf(( void** )&(a), (n), sizeof(*(a)))
+#define stbiw__sbgrow(a, n)      stbiw__sbgrowf((void**)&(a), (n), sizeof(*(a)))
 
 #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v))
-#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
-#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0)
+#define stbiw__sbcount(a)   ((a) ? stbiw__sbn(a) : 0)
+#define stbiw__sbfree(a)    ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0)
 
 static void* stbiw__sbgrowf(void** arr, int increment, int itemsize)
 {
     int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1;
-    void* p =
-        STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0,
-                            (unsigned long)itemsize * m + sizeof(int) * 2);
+    void* p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0,
+                                  (unsigned long)itemsize * m + sizeof(int) * 2);
     STBIW_ASSERT(p);
-    if(p)
+    if (p)
     {
-        if(!*arr)
-            (( int* )p)[1] = 0;
-        *arr = ( void* )(( int* )p + 2);
+        if (!*arr)
+            ((int*)p)[1] = 0;
+        *arr = (void*)((int*)p + 2);
         stbiw__sbm(*arr) = m;
     }
     return *arr;
@@ -688,7 +687,7 @@ static void* stbiw__sbgrowf(void** arr, int increment, int itemsize)
 
 static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitbuffer, int* bitcount)
 {
-    while(*bitcount >= 8)
+    while (*bitcount >= 8)
     {
         stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
         *bitbuffer >>= 8;
@@ -700,7 +699,7 @@ static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitb
 static int stbiw__zlib_bitrev(int code, int codebits)
 {
     int res = 0;
-    while(codebits--)
+    while (codebits--)
     {
         res = (res << 1) | (code & 1);
         code >>= 1;
@@ -711,8 +710,8 @@ static int stbiw__zlib_bitrev(int code, int codebits)
 static unsigned int stbiw__zlib_countm(unsigned char* a, unsigned char* b, int limit)
 {
     int i;
-    for(i = 0; i < limit && i < 258; ++i)
-        if(a[i] != b[i])
+    for (i = 0; i < limit && i < 258; ++i)
+        if (a[i] != b[i])
             break;
     return i;
 }
@@ -729,93 +728,94 @@ static unsigned int stbiw__zhash(unsigned char* data)
     return hash;
 }
 
-#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
+#define stbiw__zlib_flush()             (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
 #define stbiw__zlib_add(code, codebits) (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
-#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c)
+#define stbiw__zlib_huffa(b, c)         stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c)
 // default huffman tables
 #define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
-#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + ( n )-144, 9)
-#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + ( n )-256, 7)
-#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + ( n )-280, 8)
-#define stbiw__zlib_huff(n)              \
-    ((n) <= 143 ? stbiw__zlib_huff1(n) : \
-                  (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
+#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9)
+#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256, 7)
+#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280, 8)
+#define stbiw__zlib_huff(n)                                                \
+    ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) \
+                                     : (n) <= 279   ? stbiw__zlib_huff3(n) \
+                                                    : stbiw__zlib_huff4(n))
 #define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
 
 #define stbiw__ZHASH 16384
 
-#endif    // STBIW_ZLIB_COMPRESS
+#endif // STBIW_ZLIB_COMPRESS
 
 unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_len, int quality)
 {
 #ifdef STBIW_ZLIB_COMPRESS
     // user provided a zlib compress implementation, use that
     return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
-#else    // use builtin
-    static unsigned short lengthc[] = {3,  4,  5,  6,  7,  8,  9,  10, 11,  13,  15,  17,  19,  23,  27,
+#else  // use builtin
+    static unsigned short lengthc[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27,
                                        31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259};
     static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
                                        2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
-    static unsigned short distc[] = {1,    2,    3,    4,    5,    7,     9,     13,    17,   25,   33,
-                                     49,   65,   97,   129,  193,  257,   385,   513,   769,  1025, 1537,
+    static unsigned short distc[] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33,
+                                     49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537,
                                      2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768};
-    static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2,  3,  3,  4,  4,  5,  5,  6,
+    static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
                                      6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
     unsigned int bitbuf = 0;
     int i, j, bitcount = 0;
     unsigned char* out = NULL;
-    unsigned char*** hash_table = ( unsigned char*** )STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
-    if(hash_table == NULL)
+    unsigned char*** hash_table = (unsigned char***)STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
+    if (hash_table == NULL)
         return NULL;
-    if(quality < 5)
+    if (quality < 5)
         quality = 5;
 
-    stbiw__sbpush(out, 0x78);    // DEFLATE 32K window
-    stbiw__sbpush(out, 0x5e);    // FLEVEL = 1
+    stbiw__sbpush(out, 0x78); // DEFLATE 32K window
+    stbiw__sbpush(out, 0x5e); // FLEVEL = 1
     stbiw__zlib_add(1, 1);    // BFINAL = 1
     stbiw__zlib_add(1, 2);    // BTYPE = 1 -- fixed huffman
 
-    for(i = 0; i < stbiw__ZHASH; ++i)
+    for (i = 0; i < stbiw__ZHASH; ++i)
         hash_table[i] = NULL;
 
     i = 0;
-    while(i < data_len - 3)
+    while (i < data_len - 3)
     {
         // hash next 3 bytes of data to be compressed
         int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3;
         unsigned char* bestloc = 0;
         unsigned char** hlist = hash_table[h];
         int n = stbiw__sbcount(hlist);
-        for(j = 0; j < n; ++j)
+        for (j = 0; j < n; ++j)
         {
-            if(hlist[j] - data > i - 32768)
-            {    // if entry lies within window
+            if (hlist[j] - data > i - 32768)
+            { // if entry lies within window
                 int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i);
-                if(d >= best)
+                if (d >= best)
                     best = d, bestloc = hlist[j];
             }
         }
         // when hash table entry is too long, delete half the entries
-        if(hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality)
+        if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality)
         {
             STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality, sizeof(hash_table[h][0]) * quality);
             stbiw__sbn(hash_table[h]) = quality;
         }
         stbiw__sbpush(hash_table[h], data + i);
 
-        if(bestloc)
+        if (bestloc)
         {
             // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
             h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1);
             hlist = hash_table[h];
             n = stbiw__sbcount(hlist);
-            for(j = 0; j < n; ++j)
+            for (j = 0; j < n; ++j)
             {
-                if(hlist[j] - data > i - 32767)
+                if (hlist[j] - data > i - 32767)
                 {
                     int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1);
-                    if(e > best)
-                    {    // if next match is better, bail on current match
+                    if (e > best)
+                    { // if next match is better, bail on current match
                         bestloc = NULL;
                         break;
                     }
@@ -823,19 +823,19 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le
             }
         }
 
-        if(bestloc)
+        if (bestloc)
         {
-            int d = ( int )(data + i - bestloc);    // distance back
+            int d = (int)(data + i - bestloc); // distance back
             STBIW_ASSERT(d <= 32767 && best <= 258);
-            for(j = 0; best > lengthc[j + 1] - 1; ++j)
+            for (j = 0; best > lengthc[j + 1] - 1; ++j)
                 ;
             stbiw__zlib_huff(j + 257);
-            if(lengtheb[j])
+            if (lengtheb[j])
                 stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
-            for(j = 0; d > distc[j + 1] - 1; ++j)
+            for (j = 0; d > distc[j + 1] - 1; ++j)
                 ;
             stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5);
-            if(disteb[j])
+            if (disteb[j])
                 stbiw__zlib_add(d - distc[j], disteb[j]);
             i += best;
         }
@@ -846,25 +846,25 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le
         }
     }
     // write out final bytes
-    for(; i < data_len; ++i)
+    for (; i < data_len; ++i)
         stbiw__zlib_huffb(data[i]);
-    stbiw__zlib_huff(256);    // end of block
+    stbiw__zlib_huff(256); // end of block
     // pad with 0 bits to byte boundary
-    while(bitcount)
+    while (bitcount)
         stbiw__zlib_add(0, 1);
 
-    for(i = 0; i < stbiw__ZHASH; ++i)
-        ( void )stbiw__sbfree(hash_table[i]);
+    for (i = 0; i < stbiw__ZHASH; ++i)
+        (void)stbiw__sbfree(hash_table[i]);
     STBIW_FREE(hash_table);
 
     {
         // compute adler32 on input
         unsigned int s1 = 1, s2 = 0;
-        int blocklen = ( int )(data_len % 5552);
+        int blocklen = (int)(data_len % 5552);
         j = 0;
-        while(j < data_len)
+        while (j < data_len)
         {
-            for(i = 0; i < blocklen; ++i)
+            for (i = 0; i < blocklen; ++i)
                 s1 += data[j + i], s2 += s1;
             s1 %= 65521, s2 %= 65521;
             j += blocklen;
@@ -878,8 +878,8 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le
     *out_len = stbiw__sbn(out);
     // make returned pointer freeable
     STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
-    return ( unsigned char* )stbiw__sbraw(out);
-#endif    // STBIW_ZLIB_COMPRESS
+    return (unsigned char*)stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
 }
 
 static unsigned int stbiw__crc32(unsigned char* buffer, int len)
@@ -917,14 +917,14 @@ static unsigned int stbiw__crc32(unsigned char* buffer, int len)
 
     unsigned int crc = ~0u;
     int i;
-    for(i = 0; i < len; ++i)
+    for (i = 0; i < len; ++i)
         crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
     return ~crc;
 }
 
 #define stbiw__wpng4(o, a, b, c, d) \
     ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c), (o)[3] = STBIW_UCHAR(d), (o) += 4)
-#define stbiw__wp32(data, v) stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v));
+#define stbiw__wp32(data, v)  stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v));
 #define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3])
 
 static void stbiw__wpcrc(unsigned char** data, int len)
@@ -936,9 +936,9 @@ static void stbiw__wpcrc(unsigned char** data, int len)
 static unsigned char stbiw__paeth(int a, int b, int c)
 {
     int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
-    if(pa <= pb && pa <= pc)
+    if (pa <= pb && pa <= pc)
         return STBIW_UCHAR(a);
-    if(pb <= pc)
+    if (pb <= pc)
         return STBIW_UCHAR(b);
     return STBIW_UCHAR(c);
 }
@@ -954,58 +954,58 @@ static void stbiw__encode_png_line(unsigned char* pixels, int stride_bytes, int
     int type = mymap[filter_type];
     unsigned char* z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y);
     int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
-    for(i = 0; i < n; ++i)
+    for (i = 0; i < n; ++i)
     {
-        switch(type)
+        switch (type)
         {
-            case 0:
-                line_buffer[i] = z[i];
-                break;
-            case 1:
-                line_buffer[i] = z[i];
-                break;
-            case 2:
-                line_buffer[i] = z[i] - z[i - signed_stride];
-                break;
-            case 3:
-                line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
-                break;
-            case 4:
-                line_buffer[i] = ( signed char )(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
-                break;
-            case 5:
-                line_buffer[i] = z[i];
-                break;
-            case 6:
-                line_buffer[i] = z[i];
-                break;
+        case 0:
+            line_buffer[i] = z[i];
+            break;
+        case 1:
+            line_buffer[i] = z[i];
+            break;
+        case 2:
+            line_buffer[i] = z[i] - z[i - signed_stride];
+            break;
+        case 3:
+            line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
+            break;
+        case 4:
+            line_buffer[i] = (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
+            break;
+        case 5:
+            line_buffer[i] = z[i];
+            break;
+        case 6:
+            line_buffer[i] = z[i];
+            break;
         }
     }
-    for(i = n; i < width * n; ++i)
+    for (i = n; i < width * n; ++i)
     {
-        switch(type)
+        switch (type)
         {
-            case 0:
-                line_buffer[i] = z[i];
-                break;
-            case 1:
-                line_buffer[i] = z[i] - z[i - n];
-                break;
-            case 2:
-                line_buffer[i] = z[i] - z[i - signed_stride];
-                break;
-            case 3:
-                line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
-                break;
-            case 4:
-                line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]);
-                break;
-            case 5:
-                line_buffer[i] = z[i] - (z[i - n] >> 1);
-                break;
-            case 6:
-                line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
-                break;
+        case 0:
+            line_buffer[i] = z[i];
+            break;
+        case 1:
+            line_buffer[i] = z[i] - z[i - n];
+            break;
+        case 2:
+            line_buffer[i] = z[i] - z[i - signed_stride];
+            break;
+        case 3:
+            line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
+            break;
+        case 4:
+            line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]);
+            break;
+        case 5:
+            line_buffer[i] = z[i] - (z[i - n] >> 1);
+            break;
+        case 6:
+            line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
+            break;
         }
     }
 }
@@ -1019,76 +1019,76 @@ unsigned char* stbi_write_png_to_mem(unsigned char* pixels, int stride_bytes, in
     signed char* line_buffer;
     int j, zlen;
 
-    if(stride_bytes == 0)
+    if (stride_bytes == 0)
         stride_bytes = x * n;
 
-    if(force_filter >= 5)
+    if (force_filter >= 5)
     {
         force_filter = -1;
     }
 
-    filt = ( unsigned char* )STBIW_MALLOC((x * n + 1) * (size_t)y);
-    if(!filt)
+    filt = (unsigned char*)STBIW_MALLOC((x * n + 1) * (size_t)y);
+    if (!filt)
         return 0;
-    line_buffer = ( signed char* )STBIW_MALLOC((size_t)x * n);
-    if(!line_buffer)
+    line_buffer = (signed char*)STBIW_MALLOC((size_t)x * n);
+    if (!line_buffer)
     {
         STBIW_FREE(filt);
         return 0;
     }
-    for(j = 0; j < y; ++j)
+    for (j = 0; j < y; ++j)
     {
         int filter_type;
-        if(force_filter > -1)
+        if (force_filter > -1)
         {
             filter_type = force_filter;
             stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter, line_buffer);
         }
         else
-        {    // Estimate the best filter by running through all of them:
+        { // Estimate the best filter by running through all of them:
             int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
-            for(filter_type = 0; filter_type < 5; filter_type++)
+            for (filter_type = 0; filter_type < 5; filter_type++)
             {
                 stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type, line_buffer);
 
                 // Estimate the entropy of the line using this filter; the less, the better.
                 est = 0;
-                for(i = 0; i < x * n; ++i)
+                for (i = 0; i < x * n; ++i)
                 {
-                    est += abs(( signed char )line_buffer[i]);
+                    est += abs((signed char)line_buffer[i]);
                 }
-                if(est < best_filter_val)
+                if (est < best_filter_val)
                 {
                     best_filter_val = est;
                     best_filter = filter_type;
                 }
             }
-            if(filter_type != best_filter)
-            {    // If the last iteration already got us the best filter, don't redo it
+            if (filter_type != best_filter)
+            { // If the last iteration already got us the best filter, don't redo it
                 stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter, line_buffer);
                 filter_type = best_filter;
             }
         }
         // when we get here, filter_type contains the filter type, and line_buffer contains the data
-        filt[j * (x * n + 1)] = ( unsigned char )filter_type;
+        filt[j * (x * n + 1)] = (unsigned char)filter_type;
         STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, (size_t)x * n);
     }
     STBIW_FREE(line_buffer);
     zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen, stbi_write_png_compression_level);
     STBIW_FREE(filt);
-    if(!zlib)
+    if (!zlib)
         return 0;
 
     // each tag requires 12 bytes of overhead
-    out = ( unsigned char* )STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
-    if(!out)
+    out = (unsigned char*)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
+    if (!out)
         return 0;
     *out_len = 8 + 12 + 13 + 12 + zlen + 12;
 
     o = out;
     STBIW_MEMMOVE(o, sig, 8);
     o += 8;
-    stbiw__wp32(o, 13);    // header length
+    stbiw__wp32(o, 13); // header length
     stbiw__wptag(o, "IHDR");
     stbiw__wp32(o, x);
     stbiw__wp32(o, y);
@@ -1120,16 +1120,16 @@ STBIWDEF int stbi_write_png(char const* filename, int x, int y, int comp, const
 {
     FILE* f;
     int len;
-    unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len);
-    if(png == NULL)
+    unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len);
+    if (png == NULL)
         return 0;
 #ifdef STBI_MSC_SECURE_CRT
-    if(fopen_s(&f, filename, "wb"))
+    if (fopen_s(&f, filename, "wb"))
         f = NULL;
 #else
     f = fopen(filename, "wb");
 #endif
-    if(!f)
+    if (!f)
     {
         STBIW_FREE(png);
         return 0;
@@ -1145,8 +1145,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x,
                                     int stride_bytes)
 {
     int len;
-    unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len);
-    if(png == NULL)
+    unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len);
+    if (png == NULL)
         return 0;
     func(context, png, len);
     STBIW_FREE(png);
@@ -1161,8 +1161,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x,
  * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
  */
 
-static const unsigned char stbiw__jpg_ZigZag[] = {0,  1,  5,  6,  14, 15, 27, 28, 2,  4,  7,  13, 16, 26, 29, 42,
-                                                  3,  8,  12, 17, 25, 30, 41, 43, 9,  11, 18, 24, 31, 40, 44, 53,
+static const unsigned char stbiw__jpg_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42,
+                                                  3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53,
                                                   10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
                                                   21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};
 
@@ -1171,11 +1171,11 @@ static void stbiw__jpg_writeBits(stbi__write_context* s, int* bitBufP, int* bitC
     int bitBuf = *bitBufP, bitCnt = *bitCntP;
     bitCnt += bs[1];
     bitBuf |= bs[0] << (24 - bitCnt);
-    while(bitCnt >= 8)
+    while (bitCnt >= 8)
     {
         unsigned char c = (bitBuf >> 16) & 255;
         stbiw__putc(s, c);
-        if(c == 255)
+        if (c == 255)
         {
             stbiw__putc(s, 0);
         }
@@ -1202,33 +1202,33 @@ static void stbiw__jpg_DCT(float* d0p, float* d1p, float* d2p, float* d3p, float
     float tmp4 = d3 - d4;
 
     // Even part
-    float tmp10 = tmp0 + tmp3;    // phase 2
+    float tmp10 = tmp0 + tmp3; // phase 2
     float tmp13 = tmp0 - tmp3;
     float tmp11 = tmp1 + tmp2;
     float tmp12 = tmp1 - tmp2;
 
-    d0 = tmp10 + tmp11;    // phase 3
+    d0 = tmp10 + tmp11; // phase 3
     d4 = tmp10 - tmp11;
 
-    z1 = (tmp12 + tmp13) * 0.707106781f;    // c4
-    d2 = tmp13 + z1;    // phase 5
+    z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+    d2 = tmp13 + z1;                     // phase 5
     d6 = tmp13 - z1;
 
     // Odd part
-    tmp10 = tmp4 + tmp5;    // phase 2
+    tmp10 = tmp4 + tmp5; // phase 2
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp6 + tmp7;
 
     // The rotator is modified from fig 4-8 to avoid extra negations.
-    z5 = (tmp10 - tmp12) * 0.382683433f;    // c6
-    z2 = tmp10 * 0.541196100f + z5;    // c2-c6
-    z4 = tmp12 * 1.306562965f + z5;    // c2+c6
-    z3 = tmp11 * 0.707106781f;    // c4
+    z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+    z2 = tmp10 * 0.541196100f + z5;      // c2-c6
+    z4 = tmp12 * 1.306562965f + z5;      // c2+c6
+    z3 = tmp11 * 0.707106781f;           // c4
 
-    z11 = tmp7 + z3;    // phase 5
+    z11 = tmp7 + z3; // phase 5
     z13 = tmp7 - z3;
 
-    *d5p = z13 + z2;    // phase 6
+    *d5p = z13 + z2; // phase 6
     *d3p = z13 - z2;
     *d1p = z11 + z4;
     *d7p = z11 - z4;
@@ -1244,7 +1244,7 @@ static void stbiw__jpg_calcBits(int val, unsigned short bits[2])
     int tmp1 = val < 0 ? -val : val;
     val = val < 0 ? val - 1 : val;
     bits[1] = 1;
-    while(tmp1 >>= 1)
+    while (tmp1 >>= 1)
     {
         ++bits[1];
     }
@@ -1260,29 +1260,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt
     int DU[64];
 
     // DCT rows
-    for(dataOff = 0; dataOff < 64; dataOff += 8)
+    for (dataOff = 0; dataOff < 64; dataOff += 8)
     {
         stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2], &CDU[dataOff + 3], &CDU[dataOff + 4],
                        &CDU[dataOff + 5], &CDU[dataOff + 6], &CDU[dataOff + 7]);
     }
     // DCT columns
-    for(dataOff = 0; dataOff < 8; ++dataOff)
+    for (dataOff = 0; dataOff < 8; ++dataOff)
     {
         stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16], &CDU[dataOff + 24], &CDU[dataOff + 32],
                        &CDU[dataOff + 40], &CDU[dataOff + 48], &CDU[dataOff + 56]);
     }
     // Quantize/descale/zigzag the coefficients
-    for(i = 0; i < 64; ++i)
+    for (i = 0; i < 64; ++i)
     {
         float v = CDU[i] * fdtbl[i];
         // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
-        DU[stbiw__jpg_ZigZag[i]] = ( int )(v < 0 ? v - 0.5f : v + 0.5f);
+        DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
     }
 
     // Encode DC
     diff = DU[0] - DC;
-    if(diff == 0)
+    if (diff == 0)
     {
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
     }
@@ -1295,29 +1295,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt
     }
     // Encode ACs
     end0pos = 63;
-    for(; (end0pos > 0) && (DU[end0pos] == 0); --end0pos)
+    for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos)
     {
     }
     // end0pos = first element in reverse order !=0
-    if(end0pos == 0)
+    if (end0pos == 0)
     {
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
         return DU[0];
     }
-    for(i = 1; i <= end0pos; ++i)
+    for (i = 1; i <= end0pos; ++i)
     {
         int startpos = i;
         int nrzeroes;
         unsigned short bits[2];
-        for(; DU[i] == 0 && i <= end0pos; ++i)
+        for (; DU[i] == 0 && i <= end0pos; ++i)
         {
         }
         nrzeroes = i - startpos;
-        if(nrzeroes >= 16)
+        if (nrzeroes >= 16)
         {
             int lng = nrzeroes >> 4;
             int nrmarker;
-            for(nrmarker = 1; nrmarker <= lng; ++nrmarker)
+            for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
                 stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
             nrzeroes &= 15;
         }
@@ -1325,7 +1325,7 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]);
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
     }
-    if(end0pos != 63)
+    if (end0pos != 63)
     {
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
     }
@@ -1362,111 +1362,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in
         0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
         0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
     // Huffman tables
-    static const unsigned short YDC_HT[256][2] = {{0, 2},  {2, 3},  {3, 3},  {4, 3},   {5, 3},   {6, 3},
-                                                  {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}};
-    static const unsigned short UVDC_HT[256][2] = {{0, 2},  {1, 2},   {2, 2},   {6, 3},   {14, 4},    {30, 5},
-                                                   {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}};
+    static const unsigned short YDC_HT[256][2] = {{0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}};
+    static const unsigned short UVDC_HT[256][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}};
     static const unsigned short YAC_HT[256][2] = {
-        {10, 4},     {0, 2},      {1, 2},      {4, 3},      {11, 4},     {26, 5},     {120, 7},    {248, 8},
-        {1014, 10},  {65410, 16}, {65411, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {12, 4},     {27, 5},     {121, 7},    {502, 9},    {2038, 11},  {65412, 16}, {65413, 16},
-        {65414, 16}, {65415, 16}, {65416, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {28, 5},     {249, 8},    {1015, 10},  {4084, 12},  {65417, 16}, {65418, 16}, {65419, 16},
-        {65420, 16}, {65421, 16}, {65422, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {58, 6},     {503, 9},    {4085, 12},  {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16},
-        {65427, 16}, {65428, 16}, {65429, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {59, 6},     {1016, 10},  {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16},
-        {65435, 16}, {65436, 16}, {65437, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {122, 7},    {2039, 11},  {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16},
-        {65443, 16}, {65444, 16}, {65445, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {123, 7},    {4086, 12},  {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16},
-        {65451, 16}, {65452, 16}, {65453, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {250, 8},    {4087, 12},  {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16},
-        {65459, 16}, {65460, 16}, {65461, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {504, 9},    {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16},
-        {65467, 16}, {65468, 16}, {65469, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {505, 9},    {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16},
-        {65476, 16}, {65477, 16}, {65478, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {506, 9},    {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16},
-        {65485, 16}, {65486, 16}, {65487, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {1017, 10},  {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16},
-        {65494, 16}, {65495, 16}, {65496, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {1018, 10},  {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16},
-        {65503, 16}, {65504, 16}, {65505, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {2040, 11},  {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16},
-        {65512, 16}, {65513, 16}, {65514, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16},
-        {65522, 16}, {65523, 16}, {65524, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {2041, 11},  {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16},
-        {65532, 16}, {65533, 16}, {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0}};
+        {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 
16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}};
     static const unsigned short UVAC_HT[256][2] = {
-        {0, 2},      {1, 2},      {4, 3},      {10, 4},     {24, 5},     {25, 5},     {56, 6},     {120, 7},
-        {500, 9},    {1014, 10},  {4084, 12},  {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {11, 4},     {57, 6},     {246, 8},    {501, 9},    {2038, 11},  {4085, 12},  {65416, 16},
-        {65417, 16}, {65418, 16}, {65419, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {26, 5},     {247, 8},    {1015, 10},  {4086, 12},  {32706, 15}, {65420, 16}, {65421, 16},
-        {65422, 16}, {65423, 16}, {65424, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {27, 5},     {248, 8},    {1016, 10},  {4087, 12},  {65425, 16}, {65426, 16}, {65427, 16},
-        {65428, 16}, {65429, 16}, {65430, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {58, 6},     {502, 9},    {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16},
-        {65436, 16}, {65437, 16}, {65438, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {59, 6},     {1017, 10},  {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16},
-        {65444, 16}, {65445, 16}, {65446, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {121, 7},    {2039, 11},  {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16},
-        {65452, 16}, {65453, 16}, {65454, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {122, 7},    {2040, 11},  {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16},
-        {65460, 16}, {65461, 16}, {65462, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {249, 8},    {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16},
-        {65469, 16}, {65470, 16}, {65471, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {503, 9},    {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16},
-        {65478, 16}, {65479, 16}, {65480, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {504, 9},    {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16},
-        {65487, 16}, {65488, 16}, {65489, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {505, 9},    {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16},
-        {65496, 16}, {65497, 16}, {65498, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {506, 9},    {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16},
-        {65505, 16}, {65506, 16}, {65507, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {2041, 11},  {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16},
-        {65514, 16}, {65515, 16}, {65516, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16},
-        {65523, 16}, {65524, 16}, {65525, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {1018, 10},  {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16},
-        {65532, 16}, {65533, 16}, {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0}};
-    static const int YQT[] = {16, 11, 10, 16, 24,  40,  51,  61,  12, 12, 14, 19, 26,  58,  60,  55,
-                              14, 13, 16, 24, 40,  57,  69,  56,  14, 17, 22, 29, 51,  87,  80,  62,
-                              18, 22, 37, 56, 68,  109, 103, 77,  24, 35, 55, 64, 81,  104, 113, 92,
+        {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 
16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}};
+    static const int YQT[] = {16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55,
+                              14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62,
+                              18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92,
                               49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99};
     static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 26, 56, 99, 99, 99,
                                99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
                                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99};
-    static const float aasf[] = {1.0f * 2.828427125f,         1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f,
-                                 1.175875602f * 2.828427125f, 1.0f * 2.828427125f,         0.785694958f * 2.828427125f,
+    static const float aasf[] = {1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f,
+                                 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f,
                                  0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f};
 
     int row, col, i, k;
     float fdtbl_Y[64], fdtbl_UV[64];
     unsigned char YTable[64], UVTable[64];
 
-    if(!data || !width || !height || comp > 4 || comp < 1)
+    if (!data || !width || !height || comp > 4 || comp < 1)
     {
         return 0;
     }
 
     quality = quality ? quality : 90;
-    quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+    quality = quality < 1 ? 1 : quality > 100 ? 100
+                                              : quality;
     quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
 
-    for(i = 0; i < 64; ++i)
+    for (i = 0; i < 64; ++i)
     {
         int uvti, yti = (YQT[i] * quality + 50) / 100;
-        YTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(yti < 1 ? 1 : yti > 255 ? 255 : yti);
+        YTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255
+                                                                               : yti);
         uvti = (UVQT[i] * quality + 50) / 100;
-        UVTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+        UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255
+                                                                                  : uvti);
     }
 
-    for(row = 0, k = 0; row < 8; ++row)
+    for (row = 0, k = 0; row < 8; ++row)
     {
-        for(col = 0; col < 8; ++col, ++k)
+        for (col = 0; col < 8; ++col, ++k)
         {
             fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
             fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
@@ -1475,17 +1414,17 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in
 
     // Write Headers
     {
-        static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F',  'I',  'F', 0,    1, 1,
-                                              0,    0,    1,    0,    1, 0,    0,   0xFF, 0xDB, 0,   0x84, 0};
+        static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1,
+                                              0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0};
         static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0};
         const unsigned char head1[] = {0xFF,
                                        0xC0,
                                        0,
                                        0x11,
                                        8,
-                                       ( unsigned char )(height >> 8),
+                                       (unsigned char)(height >> 8),
                                        STBIW_UCHAR(height),
-                                       ( unsigned char )(width >> 8),
+                                       (unsigned char)(width >> 8),
                                        STBIW_UCHAR(width),
                                        3,
                                        1,
@@ -1502,50 +1441,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in
                                        0x01,
                                        0xA2,
                                        0};
-        s->func(s->context, ( void* )head0, sizeof(head0));
-        s->func(s->context, ( void* )YTable, sizeof(YTable));
+        s->func(s->context, (void*)head0, sizeof(head0));
+        s->func(s->context, (void*)YTable, sizeof(YTable));
         stbiw__putc(s, 1);
         s->func(s->context, UVTable, sizeof(UVTable));
-        s->func(s->context, ( void* )head1, sizeof(head1));
-        s->func(s->context, ( void* )(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_dc_luminance_values, sizeof(std_dc_luminance_values));
-        stbiw__putc(s, 0x10);    // HTYACinfo
-        s->func(s->context, ( void* )(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_ac_luminance_values, sizeof(std_ac_luminance_values));
-        stbiw__putc(s, 1);    // HTUDCinfo
-        s->func(s->context, ( void* )(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
-        stbiw__putc(s, 0x11);    // HTUACinfo
-        s->func(s->context, ( void* )(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
-        s->func(s->context, ( void* )head2, sizeof(head2));
+        s->func(s->context, (void*)head1, sizeof(head1));
+        s->func(s->context, (void*)(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1);
+        s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+        stbiw__putc(s, 0x10); // HTYACinfo
+        s->func(s->context, (void*)(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1);
+        s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+        stbiw__putc(s, 1); // HTUDCinfo
+        s->func(s->context, (void*)(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1);
+        s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+        stbiw__putc(s, 0x11); // HTUACinfo
+        s->func(s->context, (void*)(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1);
+        s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+        s->func(s->context, (void*)head2, sizeof(head2));
     }
 
     // Encode 8x8 macroblocks
     {
         static const unsigned short fillBits[] = {0x7F, 7};
-        const unsigned char* imageData = ( const unsigned char* )data;
+        const unsigned char* imageData = (const unsigned char*)data;
         int DCY = 0, DCU = 0, DCV = 0;
         int bitBuf = 0, bitCnt = 0;
         // comp == 2 is grey+alpha (alpha is ignored)
         int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
         int x, y, pos;
-        for(y = 0; y < height; y += 8)
+        for (y = 0; y < height; y += 8)
         {
-            for(x = 0; x < width; x += 8)
+            for (x = 0; x < width; x += 8)
             {
                 float YDU[64], UDU[64], VDU[64];
-                for(row = y, pos = 0; row < y + 8; ++row)
+                for (row = y, pos = 0; row < y + 8; ++row)
                 {
-                    for(col = x; col < x + 8; ++col, ++pos)
+                    for (col = x; col < x + 8; ++col, ++pos)
                     {
                         int p = (stbi__flip_vertically_on_write ? height - 1 - row : row) * width * comp + col * comp;
                         float r, g, b;
-                        if(row >= height)
+                        if (row >= height)
                         {
                             p -= width * comp * (row + 1 - height);
                         }
-                        if(col >= width)
+                        if (col >= width)
                         {
                             p -= comp * (col + 1 - width);
                         }
@@ -1581,14 +1520,14 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x,
 {
     stbi__write_context s;
     stbi__start_write_callbacks(&s, func, context);
-    return stbi_write_jpg_core(&s, x, y, comp, ( void* )data, quality);
+    return stbi_write_jpg_core(&s, x, y, comp, (void*)data, quality);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const void* data, int quality)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
         int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
         stbi__end_write_file(&s);
@@ -1599,7 +1538,7 @@ STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const
 }
 #endif
 
-#endif    // STB_IMAGE_WRITE_IMPLEMENTATION
+#endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
       1.09  (2018-02-11)
diff --git a/examples/common/tengine_operations.c b/examples/common/tengine_operations.c
index 3ee24716c..2ce0d8354 100644
--- a/examples/common/tengine_operations.c
+++ b/examples/common/tengine_operations.c
@@ -71,7 +71,7 @@ image load_image_stb(const char* filename, int channels)
             {
                 int dst_index = i + w * j + w * h * k;
                 int src_index = k + src_c * i + src_c * w * j;
-                im.data[dst_index] = ( float )data[src_index];
+                im.data[dst_index] = (float)data[src_index];
             }
         }
     }
@@ -83,7 +83,7 @@ image load_image_stb(const char* filename, int channels)
 image make_image(int w, int h, int c)
 {
     image out = make_empty_image(w, h, c);
-    out.data = ( float* )calloc((size_t)h * w * c, sizeof(float));
+    out.data = (float*)calloc((size_t)h * w * c, sizeof(float));
     return out;
 }
 
@@ -125,17 +125,17 @@ image imread_process(const char* filename, int img_w, int img_h, float* means, f
 
     switch (choice)
     {
-        case 0:
-            out = gray2bgr(out);
-            break;
-        case 1:
-            out = rgb2gray(out);
-            break;
-        case 2:
-            out = rgb2bgr_permute(out);
-            break;
-        default:
-            break;
+    case 0:
+        out = gray2bgr(out);
+        break;
+    case 1:
+        out = rgb2gray(out);
+        break;
+    case 2:
+        out = rgb2bgr_permute(out);
+        break;
+    default:
+        break;
     }
 
     image resImg = make_image(img_w, img_h, out.c);
@@ -171,8 +171,8 @@ image resize_image(image im, int ow, int oh)
     int h = im.h;
     int w = im.w;
     float shift = 0.f;
-    float _scale_x = ( float )((w - shift) / (ow - shift));
-    float _scale_y = ( float )((h - shift) / (oh - shift));
+    float _scale_x = (float)((w - shift) / (ow - shift));
+    float _scale_y = (float)((h - shift) / (oh - shift));
     float32x4_t scale_x = vdupq_n_f32(_scale_x);
     float offset = 0.5;
     int in_hw = h * w;
@@ -215,8 +215,7 @@ image resize_image(image im, int ow, int oh)
 
                 float32x4_t fx_0 = vsubq_f32(offset_1, fx);
 
-                const int32x4_t in_idx =
-                    vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
+                const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
                 int32x4_t in_index0 = in_idx;
                 int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1));
                 int32x4_t in_index1 = vaddq_s32(in_idx, w_0);
@@ -290,8 +289,8 @@ image resize_image(image im, int ow, int oh)
     int h = im.h;
     int w = im.w;
     float shift = 0.f;
-    float _scale_x = ( float )((w - shift) / (ow - shift));
-    float _scale_y = ( float )((h - shift) / (oh - shift));
+    float _scale_x = (float)((w - shift) / (ow - shift));
+    float _scale_y = (float)((h - shift) / (oh - shift));
 
     float32x4_t scale_x = vdupq_n_f32(_scale_x);
     float offset = 0.5;
@@ -335,8 +334,7 @@ image resize_image(image im, int ow, int oh)
 
                 float32x4_t fx_0 = vsubq_f32(offset_1, fx);
 
-                const int32x4_t in_idx =
-                    vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
+                const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
 
                 int32x4_t in_index0 = in_idx;
                 int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1));
@@ -408,8 +406,8 @@ image resize_image(image im, int ow, int oh)
 #endif
 
 #else
-    float scale_x = ( float )(im.w) / (ow);
-    float scale_y = ( float )(im.h) / (oh);
+    float scale_x = (float)(im.w) / (ow);
+    float scale_y = (float)(im.h) / (oh);
     int w = im.w;
     int h = im.h;
     int in_hw = h * w;
@@ -480,14 +478,14 @@ image copyMaker(image im, int top, int bottom, int left, int right, float value)
 
 void save_image(image im, const char* name)
 {
-    char buff[256] = { 0 };
-    unsigned char* data = ( unsigned char* )calloc((size_t)im.w * im.h * im.c, sizeof(char));
+    char buff[256] = {0};
+    unsigned char* data = (unsigned char*)calloc((size_t)im.w * im.h * im.c, sizeof(char));
     int i, k;
     for (k = 0; k < im.c; ++k)
     {
         for (i = 0; i < im.w * im.h; ++i)
         {
-            data[i * im.c + k] = ( unsigned char )(im.data[i + k * im.w * im.h]);
+            data[i * im.c + k] = (unsigned char)(im.data[i + k * im.w * im.h]);
         }
     }
 
@@ -506,22 +504,22 @@ void save_image(image im, const char* name)
 
     switch (f)
     {
-        case 0:
-            strcat(buff, ".jpg");
-        case 1:
-            success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80);
-            break;
-        case 2:
-            success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c);
-            break;
-        case 3:
-            success = stbi_write_tga(buff, im.w, im.h, im.c, data);
-            break;
-        case 4:
-            success = stbi_write_bmp(buff, im.w, im.h, im.c, data);
-            break;
-        default:
-            return;
+    case 0:
+        strcat(buff, ".jpg");
+    case 1:
+        success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80);
+        break;
+    case 2:
+        success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c);
+        break;
+    case 3:
+        success = stbi_write_tga(buff, im.w, im.h, im.c, data);
+        break;
+    case 4:
+        success = stbi_write_bmp(buff, im.w, im.h, im.c, data);
+        break;
+    default:
+        return;
     }
     free(data);
     if (!success)
@@ -586,7 +584,7 @@ static float get_pixelBychannel(image m, int x, int y, int c)
 image copy_image(image p)
 {
     image copy = p;
-    copy.data = ( float* )calloc((size_t)p.h * p.w * p.c, sizeof(float));
+    copy.data = (float*)calloc((size_t)p.h * p.w * p.c, sizeof(float));
     memcpy(copy.data, p.data, (unsigned long)p.h * p.w * p.c * sizeof(float));
     return copy;
 }
@@ -642,7 +640,8 @@ image imread2post(const char* filename)
 {
     image im = load_image_stb(filename, 0);
     const int len = im.c * im.h * im.w;
-    for (int i = 0; i < len; ++i) {
+    for (int i = 0; i < len; ++i)
+    {
         im.data[i] *= 255;
     }
     return im;
@@ -651,20 +650,21 @@ image imread2post(const char* filename)
 image rgb2bgr_permute(image src)
 {
     const int len = src.c * src.h * src.w;
-    float* GRB = ( float* )malloc(sizeof(float) * len);
+    float* GRB = (float*)malloc(sizeof(float) * len);
     for (int c = 0; c < src.c; c++)
     {
         for (int h = 0; h < src.h; h++)
         {
             for (int w = 0; w < src.w; w++)
             {
-                int newIndex = ( c )*src.h * src.w + h * src.w + w;
+                int newIndex = (c)*src.h * src.w + h * src.w + w;
                 int grbIndex = (2 - c) * src.h * src.w + h * src.w + w;
                 GRB[grbIndex] = src.data[newIndex];
             }
         }
     }
-    for (int i = 0; i < len; ++i) {
+    for (int i = 0; i < len; ++i)
+    {
         src.data[i] = GRB[i];
     }
     free(GRB);
@@ -673,14 +673,14 @@ image rgb2bgr_permute(image src)
 
 image image_permute(image src)
 {
-    float* GRB = ( float* )malloc(sizeof(float) * src.c * src.h * src.w);
+    float* GRB = (float*)malloc(sizeof(float) * src.c * src.h * src.w);
     for (int c = 0; c < src.c; c++)
     {
         for (int h = 0; h < src.h; h++)
         {
             for (int w = 0; w < src.w; w++)
             {
-                int newIndex = ( c )*src.h * src.w + h * src.w + w;
+                int newIndex = (c)*src.h * src.w + h * src.w + w;
                 int grbIndex = (2 - c) * src.h * src.w + h * src.w + w;
                 GRB[grbIndex] = src.data[newIndex];
             }
@@ -696,7 +696,7 @@ image gray2bgr(image src)
     res.c = 3;
     res.h = src.h;
     res.w = src.w;
-    res.data = ( float* )malloc(sizeof(float) * 3 * src.h * src.w);
+    res.data = (float*)malloc(sizeof(float) * 3 * src.h * src.w);
     for (int x = 0; x < src.h; x++)
     {
         for (int y = 0; y < src.w; y++)
@@ -714,7 +714,7 @@ image gray2bgr(image src)
 image tranpose(image src)
 {
     int size = src.c * src.h * src.w;
-    float* tempData = ( float* )malloc(sizeof(float) * size);
+    float* tempData = (float*)malloc(sizeof(float) * size);
     int index = 0;
 
     for (int c = 0; c < src.c; c++)
@@ -811,7 +811,7 @@ image rgb2gray(image src)
     res.h = src.h;
     res.w = src.w;
     res.c = 1;
-    res.data = ( float* )malloc(sizeof(float) * res.h * res.w);
+    res.data = (float*)malloc(sizeof(float) * res.h * res.w);
     for (int i = 0; i < res.h; i++)
     {
         for (int j = 0; j < res.w; j++)
@@ -838,7 +838,7 @@ image letterbox(image im, int w, int h)
 {
     int ow = im.w;
     int oh = im.h;
-    if ((( float )w / im.w) < (( float )h / im.h))
+    if (((float)w / im.w) < ((float)h / im.h))
     {
         ow = w;
         oh = (im.h * w) / im.w;
@@ -853,7 +853,7 @@ image letterbox(image im, int w, int h)
     boxed.w = w;
     boxed.h = h;
     boxed.c = im.c;
-    boxed.data = ( float* )malloc(sizeof(float) * im.c * h * w);
+    boxed.data = (float*)malloc(sizeof(float) * im.c * h * w);
 
     for (int i = 0; i < boxed.c * boxed.h * boxed.w; i++)
     {
@@ -868,20 +868,20 @@ image letterbox(image im, int w, int h)
 
 void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, int w)
 {
-    float _scale_x = ( float )(w) / ( float )(ow);
-    float _scale_y = ( float )(h) / ( float )(oh);
+    float _scale_x = (float)(w) / (float)(ow);
+    float _scale_y = (float)(h) / (float)(oh);
     float offset = 0.5f;
 
-    int16_t* buf = ( int16_t* )malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t));
-    int16_t* xCoef = ( int16_t* )(buf);
-    int16_t* xPos = ( int16_t* )(buf + ow + ow);
-    int16_t* yCoef = ( int16_t* )(buf + ow + ow + ow);
-    int16_t* yPos = ( int16_t* )(buf + ow + ow + ow + oh + oh);
+    int16_t* buf = (int16_t*)malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t));
+    int16_t* xCoef = (int16_t*)(buf);
+    int16_t* xPos = (int16_t*)(buf + ow + ow);
+    int16_t* yCoef = (int16_t*)(buf + ow + ow + ow);
+    int16_t* yPos = (int16_t*)(buf + ow + ow + ow + oh + oh);
 
     for (int i = 0; i < ow; i++)
     {
-        float fx = ( float )((( float )i + offset) * _scale_x - offset);
-        int sx = ( int )fx;
+        float fx = (float)(((float)i + offset) * _scale_x - offset);
+        int sx = (int)fx;
         fx -= sx;
         if (sx < 0)
         {
@@ -900,8 +900,8 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i
 
     for (int j = 0; j < oh; j++)
     {
-        float fy = ( float )((( float )j + offset) * _scale_y - offset);
-        int sy = ( int )fy;
+        float fy = (float)(((float)j + offset) * _scale_y - offset);
+        int sy = (int)fy;
         fy -= sy;
         if (sy < 0)
         {
@@ -919,7 +919,7 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i
     }
 
     //    int32_t* row = new int32_t[ow + ow];
-    int32_t* row = ( int32_t* )malloc((ow + ow) * sizeof(int32_t));
+    int32_t* row = (int32_t*)malloc((ow + ow) * sizeof(int32_t));
 
     for (int k = 0; k < c; k++)
     {
@@ -1019,7 +1019,7 @@ static void sort_cls_score(cls_score* array, int left, int right)
 
 void print_topk(float* data, int total_num, int topk)
 {
-    cls_score* cls_scores = ( cls_score* )malloc(total_num * sizeof(cls_score));
+    cls_score* cls_scores = (cls_score*)malloc(total_num * sizeof(cls_score));
     for (int i = 0; i < total_num; i++)
     {
         cls_scores[i].id = i;
diff --git a/examples/common/test_nnie_all.hpp b/examples/common/test_nnie_all.hpp
index e10151554..4c55dbe73 100644
--- a/examples/common/test_nnie_all.hpp
+++ b/examples/common/test_nnie_all.hpp
@@ -33,29 +33,29 @@
 #include "mpi_nnie.h"
 
 /*16Byte align*/
-#define TEST_NNIE_ALIGN_16 16
-#define TEST_NNIE_ALIGN16(u32Num) ((u32Num + TEST_NNIE_ALIGN_16 - 1) / TEST_NNIE_ALIGN_16 * TEST_NNIE_ALIGN_16)
-#define TEST_NNIE_COORDI_NUM 4     /*coordinate numbers*/
-#define TEST_NNIE_QUANT_BASE 4096  /*the base value*/
-#define TEST_NNIE_PROPOSAL_WIDTH 6 /*the number of proposal values*/
-#define TEST_NNIE_SSD_REPORT_NODE_NUM 12
-#define TEST_NNIE_MAX_SOFTWARE_MEM_NUM 4
-#define TEST_NNIE_SSD_REPORT_NODE_NUM 12
-#define TEST_NNIE_SSD_PRIORBOX_NUM 6
-#define TEST_NNIE_SSD_SOFTMAX_NUM 6
-#define TEST_NNIE_SSD_ASPECT_RATIO_NUM 6
-#define TEST_NNIE_YOLOV3_REPORT_BLOB_NUM 3             /*yolov3 report blob num*/
+#define TEST_NNIE_ALIGN_16                          16
+#define TEST_NNIE_ALIGN16(u32Num)                   ((u32Num + TEST_NNIE_ALIGN_16 - 1) / TEST_NNIE_ALIGN_16 * TEST_NNIE_ALIGN_16)
+#define TEST_NNIE_COORDI_NUM                        4    /*coordinate numbers*/
+#define TEST_NNIE_QUANT_BASE                        4096 /*the base value*/
+#define TEST_NNIE_PROPOSAL_WIDTH                    6    /*the number of proposal values*/
+#define TEST_NNIE_SSD_REPORT_NODE_NUM               12
+#define TEST_NNIE_MAX_SOFTWARE_MEM_NUM              4
+#define TEST_NNIE_SSD_REPORT_NODE_NUM               12
+#define TEST_NNIE_SSD_PRIORBOX_NUM                  6
+#define TEST_NNIE_SSD_SOFTMAX_NUM                   6
+#define TEST_NNIE_SSD_ASPECT_RATIO_NUM              6
+#define TEST_NNIE_YOLOV3_REPORT_BLOB_NUM            3  /*yolov3 report blob num*/
 #define TEST_NNIE_YOLOV3_EACH_BBOX_INFER_RESULT_NUM 85 /*yolov3 inference result num of each bbox*/
-#define TEST_NNIE_YOLOV3_EACH_GRID_BIAS_NUM 6          /*yolov3 bias num of each grid*/
-#define TEST_NNIE_SCORE_NUM 2                          /*the num of RPN scores*/
+#define TEST_NNIE_YOLOV3_EACH_GRID_BIAS_NUM         6  /*yolov3 bias num of each grid*/
+#define TEST_NNIE_SCORE_NUM                         2  /*the num of RPN scores*/
 
-#define TEST_NNIE_COORDI_NUM 4 /*coordinate numbers*/
-#define TEST_COORDI_NUM 4      /*num of coordinates*/
-#define TEST_NNIE_HALF 0.5f    /*the half value*/
-#define TEST_NNIE_MAX(a, b) (((a) > (b)) ? (a) : (b))
-#define TEST_NNIE_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define TEST_NNIE_COORDI_NUM 4    /*coordinate numbers*/
+#define TEST_COORDI_NUM      4    /*num of coordinates*/
+#define TEST_NNIE_HALF       0.5f /*the half value*/
+#define TEST_NNIE_MAX(a, b)  (((a) > (b)) ? (a) : (b))
+#define TEST_NNIE_MIN(a, b)  (((a) < (b)) ? (a) : (b))
 
-#define TEST_NNIE_SIGMOID(x) (HI_FLOAT)(1.0f / (1 + fast_exp(-x)))
+#define TEST_NNIE_SIGMOID(x)       (HI_FLOAT)(1.0f / (1 + fast_exp(-x)))
 #define TEST_NNIE_SIGMOID_NOEXP(x) (HI_FLOAT)(1.0f / (1 + x))
 
 inline float32x4_t vexpq10_f32(float32x4_t x)
@@ -74,7 +74,7 @@ inline float32x4_t vexpq10_f32(float32x4_t x)
     return x;
 }
 
-void fast_exp_4f(const float *a, float *xx)
+void fast_exp_4f(const float* a, float* xx)
 {
     float32x4_t x = vld1q_f32(a);
     x = vexpq10_f32(x);
@@ -120,14 +120,14 @@ typedef struct hiTEST_NNIE_FASTERRCNN_SOFTWARE_PARAM_S
     HI_U32 u32ClassNum;
     HI_U32 au32ConfThresh[21];
     HI_U32 u32ValidNmsThresh;
-    HI_S32 *aps32Conv[2];
+    HI_S32* aps32Conv[2];
     SVP_MEM_INFO_S stRpnTmpBuf;
     SVP_DST_BLOB_S stRpnBbox;
     SVP_DST_BLOB_S stClassRoiNum;
     SVP_DST_BLOB_S stDstRoi;
     SVP_DST_BLOB_S stDstScore;
     SVP_MEM_INFO_S stGetResultTmpBuf;
-    HI_CHAR *apcRpnDataLayerName[2];
+    HI_CHAR* apcRpnDataLayerName[2];
 } TEST_NNIE_FASTERRCNN_SOFTWARE_PARAM_S;
 
 typedef struct hiTEST_NNIE_CNN_GETTOPN_UNIT_S
@@ -270,7 +270,7 @@ typedef struct hiTEST_NNIE_STACK
     HI_S32 s32Max;
 } TEST_NNIE_STACK_S;
 
-HI_S32 SAMPLE_COMM_SVP_MallocMem(const HI_CHAR *pszMmb, const HI_CHAR *pszZone, HI_U64 *pu64PhyAddr, HI_VOID **ppvVirAddr, HI_U32 u32Size)
+HI_S32 SAMPLE_COMM_SVP_MallocMem(const HI_CHAR* pszMmb, const HI_CHAR* pszZone, HI_U64* pu64PhyAddr, HI_VOID** ppvVirAddr, HI_U32 u32Size)
 {
     HI_S32 s32Ret = HI_SUCCESS;
 
diff --git a/examples/cpp_tm_classification.cpp b/examples/cpp_tm_classification.cpp
index f5cb0d3a7..d4451f1cb 100644
--- a/examples/cpp_tm_classification.cpp
+++ b/examples/cpp_tm_classification.cpp
@@ -35,15 +35,15 @@
 #include "tengine_cpp_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 227
-#define DEFAULT_IMG_W 227
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        227
+#define DEFAULT_IMG_W        227
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 
 using namespace std;
@@ -54,7 +54,8 @@ void show_usage()
               << "    [-m model_file] [-l label_file] [-i image_file]\n"
               << "    [-g img_h,img_w] [-s scale] [-w mean[0],mean[1],mean[2]] [-r repeat_count]\n";
 
-    std::cout << "\nmobilenet example: \n" << "    ./classification -m /path/to/mobilenet.tmfile -l /path/to/labels.txt -i /path/to/img.jpg -g 224,224 -s 0.017 -w 104.007,116.669,122.679" << std::endl;
+    std::cout << "\nmobilenet example: \n"
+              << "    ./classification -m /path/to/mobilenet.tmfile -l /path/to/labels.txt -i /path/to/img.jpg -g 224,224 -s 0.017 -w 104.007,116.669,122.679" << std::endl;
 }
 
 int main(int argc, char* argv[])
@@ -74,34 +75,34 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -112,7 +113,7 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-    if(image_file.empty())
+    if (image_file.empty())
     {
         std::cerr << "Error: Image file not specified!" << std::endl;
         show_usage();
@@ -120,15 +121,15 @@ int main(int argc, char* argv[])
     }
 
     // check input files
-    if(!check_file_exist(model_file.c_str()) || !check_file_exist(image_file.c_str()))
+    if (!check_file_exist(model_file.c_str()) || !check_file_exist(image_file.c_str()))
         return -1;
 
-    if(img_h == 0)
+    if (img_h == 0)
     {
         img_h = DEFAULT_IMG_H;
         std::cout << "Image height not specified, use default [" << DEFAULT_IMG_H << "]" << std::endl;
     }
-    if(img_w == 0)
+    if (img_w == 0)
     {
         img_w = DEFAULT_IMG_W;
         std::cout << "Image width not specified, use default [" << DEFAULT_IMG_W << "]" << std::endl;
@@ -140,7 +141,7 @@ int main(int argc, char* argv[])
         scale[2] = DEFAULT_SCALE3;
         std::cout << "Scale value not specified, use default [" << scale[0] << ", " << scale[1] << ", " << scale[2] << "]" << std::endl;
     }
-    if(mean[0] == -1.0 || mean[1] == -1.0 || mean[2] == -1.0)
+    if (mean[0] == -1.0 || mean[1] == -1.0 || mean[2] == -1.0)
     {
         mean[0] = DEFAULT_MEAN1;
         mean[1] = DEFAULT_MEAN2;
@@ -169,7 +170,7 @@ int main(int argc, char* argv[])
 
         /* prepare input data */
         input_tensor.create(1, 3, img_h, img_w);
-        get_input_data(image_file.c_str(), ( float* )input_tensor.data, img_h, img_w, mean, scale);
+        get_input_data(image_file.c_str(), (float*)input_tensor.data, img_h, img_w, mean, scale);
 
         /* forward */
         somenet.input_tensor("data", input_tensor);
@@ -196,7 +197,7 @@ int main(int argc, char* argv[])
         somenet.extract_tensor("prob", output_tensor);
 
         /* after process */
-        print_topk(( float* )output_tensor.data, output_tensor.elem_num, 5);
+        print_topk((float*)output_tensor.data, output_tensor.elem_num, 5);
         std::cout << "--------------------------------------\n";
         std::cout << "ALL TEST DONE\n";
     }
diff --git a/examples/cpp_tm_mobilenet_ssd.cpp b/examples/cpp_tm_mobilenet_ssd.cpp
index b85fe348b..28ad5ceaf 100644
--- a/examples/cpp_tm_mobilenet_ssd.cpp
+++ b/examples/cpp_tm_mobilenet_ssd.cpp
@@ -55,16 +55,16 @@ typedef struct Box
 
 void post_process_ssd(const string image_file, float threshold, const float* outdata, int num)
 {
-    const char* class_names[] = {"background", "aeroplane", "bicycle",   "bird",   "boat",        "bottle",
-                                 "bus",        "car",       "cat",       "chair",  "cow",         "diningtable",
-                                 "dog",        "horse",     "motorbike", "person", "pottedplant", "sheep",
-                                 "sofa",       "train",     "tvmonitor"};
+    const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle",
+                                 "bus", "car", "cat", "chair", "cow", "diningtable",
+                                 "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
+                                 "sofa", "train", "tvmonitor"};
 
     image im = imread(image_file.c_str());
 
     int raw_h = im.h;
     int raw_w = im.w;
-//    struct vector* boxes = create_vector(sizeof(Box_t), nullptr);
+    //    struct vector* boxes = create_vector(sizeof(Box_t), nullptr);
     std::vector<Box_t> boxes;
 
     fprintf(stderr, "detect result num: %d \n", num);
@@ -122,23 +122,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -150,7 +150,7 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-    if(image_file.empty())
+    if (image_file.empty())
     {
         std::cerr << "Error: Image file not specified!" << std::endl;
         show_usage();
@@ -184,7 +184,7 @@ int main(int argc, char* argv[])
 
         /* prepare input data */
         input_tensor.create(1, 3, img_h, img_w);
-        get_input_data(image_file.c_str(), ( float* )input_tensor.data, img_h, img_w, mean, scale);
+        get_input_data(image_file.c_str(), (float*)input_tensor.data, img_h, img_w, mean, scale);
 
         /* forward */
         somenet.input_tensor("data", input_tensor);
@@ -211,7 +211,7 @@ int main(int argc, char* argv[])
         somenet.extract_tensor("detection_out", output_tensor);
 
         /* SSD process */
-        post_process_ssd(image_file, show_threshold, ( float* )output_tensor.data, output_tensor.h);
+        post_process_ssd(image_file, show_threshold, (float*)output_tensor.data, output_tensor.h);
     }
 
     /* release */
diff --git a/examples/tm_alphapose.cpp b/examples/tm_alphapose.cpp
index 4be2296a4..5776993b5 100644
--- a/examples/tm_alphapose.cpp
+++ b/examples/tm_alphapose.cpp
@@ -34,14 +34,14 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 320
-#define DEFAULT_IMG_W 256
-#define DEFAULT_SCALE1 (0.0039216)
-#define DEFAULT_SCALE2 (0.0039215)
-#define DEFAULT_SCALE3 (0.0039215)
-#define DEFAULT_MEAN1 0.406
-#define DEFAULT_MEAN2 0.457
-#define DEFAULT_MEAN3 0.480
+#define DEFAULT_IMG_H        320
+#define DEFAULT_IMG_W        256
+#define DEFAULT_SCALE1       (0.0039216)
+#define DEFAULT_SCALE2       (0.0039215)
+#define DEFAULT_SCALE3       (0.0039215)
+#define DEFAULT_MEAN1        0.406
+#define DEFAULT_MEAN2        0.457
+#define DEFAULT_MEAN3        0.480
 #define DEFAULT_REPEAT_COUNT 1
 #define DEFAULT_THREAD_COUNT 1
 
@@ -51,7 +51,7 @@ using predict_t = std::tuple<cv::Mat, cv::Mat, cv::Mat>;
 
 const float s_keypoint_thresh = 0.2;
 
-cv::Mat get_3rd_point(const cv::Mat & a, const cv::Mat & b)
+cv::Mat get_3rd_point(const cv::Mat& a, const cv::Mat& b)
 {
     auto direct = a - b;
     cv::Mat result(direct.size(), direct.type());
@@ -60,13 +60,13 @@ cv::Mat get_3rd_point(const cv::Mat & a, const cv::Mat & b)
     return result;
 }
 
-cv::Mat get_input_data_pose(const char * img_file_path)
+cv::Mat get_input_data_pose(const char* img_file_path)
 {
     cv::Mat img = cv::imread(img_file_path);
     cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
     img.convertTo(img, CV_32FC3);
 
-    float* img_data = ( float* )img.data;
+    float* img_data = (float*)img.data;
     float means[3]{DEFAULT_MEAN1, DEFAULT_MEAN2, DEFAULT_MEAN3};
     float scales[3]{DEFAULT_SCALE1, DEFAULT_SCALE2, DEFAULT_SCALE3};
 
@@ -85,11 +85,11 @@ cv::Mat get_input_data_pose(const char * img_file_path)
     return std::move(img);
 }
 
-cv::Mat crop_box(const cv::Mat & org_img,
-                           const pt_t & up_left,
-                           const pt_t & bottom_right,
-                           const int & input_res_h,
-                           const int & input_res_w)
+cv::Mat crop_box(const cv::Mat& org_img,
+                 const pt_t& up_left,
+                 const pt_t& bottom_right,
+                 const int& input_res_h,
+                 const int& input_res_w)
 {
     auto img = org_img.clone();
 
@@ -144,16 +144,16 @@ cv::Mat crop_box(const cv::Mat & org_img,
     return std::move(dst_img);
 }
 
-float * pre_process_pose(cv::Mat & img,
-                        const std::vector<bbox_t> & boxes,
-                        std::vector<pt_t> & pt1,
-                        std::vector<pt_t> & pt2)
+float* pre_process_pose(cv::Mat& img,
+                        const std::vector<bbox_t>& boxes,
+                        std::vector<pt_t>& pt1,
+                        std::vector<pt_t>& pt2)
 {
     const int img_height = img.rows;
     const int img_width = img.cols;
 
-    float * predict_data = (float *) malloc (boxes.size() * DEFAULT_IMG_H * DEFAULT_IMG_W * 3 * sizeof(float));
-    float * p_data = predict_data;
+    float* predict_data = (float*)malloc(boxes.size() * DEFAULT_IMG_H * DEFAULT_IMG_W * 3 * sizeof(float));
+    float* p_data = predict_data;
 
     for (size_t i = 0; i < boxes.size(); i++)
     {
@@ -167,10 +167,8 @@ float * pre_process_pose(cv::Mat & img,
         up_left[0] = std::max(0.f, up_left[0] - box_wt * scale_rate / 2);
         up_left[1] = std::max(0.f, up_left[1] - box_ht * scale_rate / 2);
 
-        bottom_right[0] =
-            std::max(std::min(img_width - 1.f, bottom_right[0] + box_wt * scale_rate / 2), up_left[0] + 5);
-        bottom_right[1] =
-            std::max(std::min(img_height - 1.f, bottom_right[1] + box_ht * scale_rate / 2), up_left[1] + 5);
+        bottom_right[0] = std::max(std::min(img_width - 1.f, bottom_right[0] + box_wt * scale_rate / 2), up_left[0] + 5);
+        bottom_right[1] = std::max(std::min(img_height - 1.f, bottom_right[1] + box_ht * scale_rate / 2), up_left[1] + 5);
 
         auto inp = crop_box(img, up_left, bottom_right, DEFAULT_IMG_H, DEFAULT_IMG_W);
         //HWC -> CHW
@@ -192,8 +190,8 @@ float * pre_process_pose(cv::Mat & img,
     return predict_data;
 }
 
-cv::Mat transform_box_invert_batch(cv::Mat & pt,
-                                   const std::vector<pt_t> & ul, const std::vector<pt_t> & br,
+cv::Mat transform_box_invert_batch(cv::Mat& pt,
+                                   const std::vector<pt_t>& ul, const std::vector<pt_t>& br,
                                    const int& input_res_h, const int& input_res_w,
                                    const int& output_res_h, const int& output_res_w)
 {
@@ -204,8 +202,8 @@ cv::Mat transform_box_invert_batch(cv::Mat & pt,
 
     for (size_t i = 0; i < center.size(); i++)
     {
-        auto & len_h_element = len_h[i];
-        auto & len_w_element = len_w[i];
+        auto& len_h_element = len_h[i];
+        auto& len_w_element = len_w[i];
         len_h_element = std::numeric_limits<float>::min();
         for (size_t j = 0; j < std::tuple_size<pt_t>::value; j++)
         {
@@ -221,10 +219,9 @@ cv::Mat transform_box_invert_batch(cv::Mat & pt,
                 len_h_element = size[i][j];
             }
         }
-        len_w_element = len_h_element *  (input_res_w * 1.f / input_res_h);
+        len_w_element = len_h_element * (input_res_w * 1.f / input_res_h);
     }
-    auto clamp_min_func = [](float v, float min = 0.f)
-    {
+    auto clamp_min_func = [](float v, float min = 0.f) {
         if (v < min) return min;
         return v;
     };
@@ -248,16 +245,15 @@ cv::Mat transform_box_invert_batch(cv::Mat & pt,
     return std::move(new_point);
 }
 
-predict_t get_predict(float * hm_data,
+predict_t get_predict(float* hm_data,
                       const int hm_dims[4],
-                      const std::vector<pt_t> & pt1,
-                      const std::vector<pt_t> & pt2,
-                      const int & input_res_h,
-                      const int & input_res_w)
+                      const std::vector<pt_t>& pt1,
+                      const std::vector<pt_t>& pt2,
+                      const int& input_res_h,
+                      const int& input_res_w)
 {
     // Get Keypoint location from heatmap
-    auto get_hm_data = [](float * data, const int data_dims[4], const std::array<int, 4> ele_dims)
-    {
+    auto get_hm_data = [](float* data, const int data_dims[4], const std::array<int, 4> ele_dims) {
         return *(data
                  + ele_dims[0] * data_dims[1] * data_dims[2] * data_dims[3]
                  + ele_dims[1] * data_dims[2] * data_dims[3]
@@ -265,14 +261,14 @@ predict_t get_predict(float * hm_data,
                  + ele_dims[3]);
     };
 
-    cv::Mat preds(hm_dims[0], hm_dims[1],  CV_32FC2);
+    cv::Mat preds(hm_dims[0], hm_dims[1], CV_32FC2);
     cv::Mat maxval(hm_dims[0], hm_dims[1], CV_32FC1);
 
     for (int i = 0; i < hm_dims[0]; i++)
     {
         for (int j = 0; j < hm_dims[1]; j++)
         {
-            float * start_iter = hm_data + i * hm_dims[1] * hm_dims[2] * hm_dims[3] + j * hm_dims[2] * hm_dims[3];
+            float* start_iter = hm_data + i * hm_dims[1] * hm_dims[2] * hm_dims[3] + j * hm_dims[2] * hm_dims[3];
             auto max_element = std::max_element(start_iter, start_iter + hm_dims[2] * hm_dims[3]);
             preds.ptr<cv::Vec2f>(i, j)->val[0] = preds.ptr<cv::Vec2f>(i, j)->val[1] = std::distance(start_iter, max_element) + 1;
             maxval.at<float>(i, j) = *max_element;
@@ -301,10 +297,11 @@ predict_t get_predict(float * hm_data,
                 && (0 < pY)
                 && (pY < (hm_dims[3] - 1)))
             {
-                auto sign_func = [](float x)
-                {
-                    if (x > 0.) x = 1.f;
-                    else if (x < 0.) x = -1.f;
+                auto sign_func = [](float x) {
+                    if (x > 0.)
+                        x = 1.f;
+                    else if (x < 0.)
+                        x = -1.f;
                     return x;
                 };
 
@@ -322,8 +319,8 @@ predict_t get_predict(float * hm_data,
     return std::make_tuple(preds, preds_tf, maxval);
 }
 
-void post_process_pose(const char * image_file,
-                       float * heatmap_data, int heatmap_dims[4],
+void post_process_pose(const char* image_file,
+                       float* heatmap_data, int heatmap_dims[4],
                        const std::vector<pt_t>& pt1, const std::vector<pt_t>& pt2)
 {
     cv::Mat preds_hm, preds_scores;
@@ -348,7 +345,7 @@ void show_usage()
     fprintf(stderr, "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
-bool tengine_predict(float * input_data, graph_t graph, const int input_dims[4], const int & num_thread, const int & loop_count)
+bool tengine_predict(float* input_data, graph_t graph, const int input_dims[4], const int& num_thread, const int& loop_count)
 {
     /* set runtime options */
     struct options opt;
@@ -423,23 +420,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -482,13 +479,13 @@ int main(int argc, char* argv[])
     int img_width = input_tensor.cols;
 
     // support multi-roi boxes later
-    std::vector<bbox_t> boxes {{0,0, static_cast<float>(img_width - 1), static_cast<float>(img_height - 1)}};
+    std::vector<bbox_t> boxes{{0, 0, static_cast<float>(img_width - 1), static_cast<float>(img_height - 1)}};
     std::vector<pt_t> pt1, pt2;
     pt1.resize(boxes.size());
     pt2.resize(boxes.size());
 
     // pre-process
-    float * input_data = pre_process_pose(input_tensor, boxes, pt1, pt2);
+    float* input_data = pre_process_pose(input_tensor, boxes, pt1, pt2);
     int input_dims[] = {static_cast<int>(boxes.size()), 3, DEFAULT_IMG_H, DEFAULT_IMG_W}; // nchw
 
     // run prediction
@@ -503,7 +500,7 @@ int main(int argc, char* argv[])
     int heatmap_dims[MAX_SHAPE_DIM_NUM] = {0};
     get_tensor_shape(output_tensor, heatmap_dims, MAX_SHAPE_DIM_NUM);
 
-    post_process_pose(image_file, (float *)get_tensor_buffer(output_tensor), heatmap_dims, pt1, pt2);
+    post_process_pose(image_file, (float*)get_tensor_buffer(output_tensor), heatmap_dims, pt1, pt2);
 
     if (input_data)
     {
diff --git a/examples/tm_classification.c b/examples/tm_classification.c
index ceda33270..f4d33d878 100644
--- a/examples/tm_classification.c
+++ b/examples/tm_classification.c
@@ -29,15 +29,15 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 224
-#define DEFAULT_IMG_W 224
-#define DEFAULT_SCALE1 0.017f
-#define DEFAULT_SCALE2 0.017f
-#define DEFAULT_SCALE3 0.017f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        224
+#define DEFAULT_IMG_W        224
+#define DEFAULT_SCALE1       0.017f
+#define DEFAULT_SCALE2       0.017f
+#define DEFAULT_SCALE3       0.017f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
@@ -69,8 +69,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -89,7 +89,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -131,7 +131,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     print_topk(output_data, output_size, 5);
@@ -176,37 +176,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_acl.c b/examples/tm_classification_acl.c
index f6a1cbff5..0d1fbc4c4 100644
--- a/examples/tm_classification_acl.c
+++ b/examples/tm_classification_acl.c
@@ -29,15 +29,15 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 227
-#define DEFAULT_IMG_W 227
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        227
+#define DEFAULT_IMG_W        227
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
@@ -78,8 +78,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -98,7 +98,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -140,7 +140,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     print_topk(output_data, output_size, 5);
@@ -185,37 +185,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_cuda.cpp b/examples/tm_classification_cuda.cpp
index 7166305ab..943643564 100644
--- a/examples/tm_classification_cuda.cpp
+++ b/examples/tm_classification_cuda.cpp
@@ -29,15 +29,15 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 227
-#define DEFAULT_IMG_W 227
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        227
+#define DEFAULT_IMG_W        227
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
@@ -58,7 +58,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
         return -1;
     }
     fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());
-	
+
     /* create NVIDIA CUDA backend */
     context_t cuda_context = create_context("cuda", 1);
     int rtt = add_context_device(cuda_context, "CUDA");
@@ -66,7 +66,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
     {
         fprintf(stderr, " add_context_device NV CUDA DEVICE failed.\n");
         return -1;
-    }	
+    }
 
     /* create graph, load tengine model xxx.tmfile */
     graph_t graph = create_graph(cuda_context, "tengine", model_file);
@@ -78,8 +78,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -98,7 +98,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -140,7 +140,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     print_topk(output_data, output_size, 5);
@@ -185,37 +185,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_fp16.c b/examples/tm_classification_fp16.c
index 3108679e6..870fb5756 100644
--- a/examples/tm_classification_fp16.c
+++ b/examples/tm_classification_fp16.c
@@ -30,22 +30,22 @@
 #include "tengine_operations.h"
 #include "compiler_fp16.h"
 
-#define DEFAULT_IMG_H 227
-#define DEFAULT_IMG_W 227
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        227
+#define DEFAULT_IMG_W        227
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 
 void get_input_fp16_data(const char* image_file, __fp16* input_data, int img_h, int img_w, float* mean, float* scale)
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
         input_data[i] = fp32_to_fp16(image_data[i]);
@@ -81,8 +81,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    __fp16* input_data = ( __fp16* )malloc(img_size * sizeof(__fp16));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    __fp16* input_data = (__fp16*)malloc(img_size * sizeof(__fp16));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -141,11 +141,11 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    __fp16* output_fp16 = ( __fp16* )get_tensor_buffer(output_tensor);
+    __fp16* output_fp16 = (__fp16*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(__fp16);
 
     /* cast fp16 to fp32 */
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
         output_data[i] = fp16_to_fp32(output_fp16[i]);
 
@@ -193,34 +193,34 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_int8.c b/examples/tm_classification_int8.c
index 734b68a09..098a2fa3e 100644
--- a/examples/tm_classification_int8.c
+++ b/examples/tm_classification_int8.c
@@ -29,23 +29,23 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 224
-#define DEFAULT_IMG_W 224
-#define DEFAULT_SCALE1 0.017f
-#define DEFAULT_SCALE2 0.017f
-#define DEFAULT_SCALE3 0.017f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        224
+#define DEFAULT_IMG_W        224
+#define DEFAULT_SCALE1       0.017f
+#define DEFAULT_SCALE2       0.017f
+#define DEFAULT_SCALE3       0.017f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 
 void get_input_int8_data(const char* image_file, int8_t* input_data, int img_h, int img_w, float* mean, float* scale,
-                          float input_scale)
+                         float input_scale)
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
     {
@@ -89,8 +89,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    int8_t* input_data = ( int8_t* )malloc(img_size);
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    int8_t* input_data = (int8_t*)malloc(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -152,16 +152,16 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    int8_t* output_i8 = ( int8_t* )get_tensor_buffer(output_tensor);
+    int8_t* output_i8 = (int8_t*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor);
 
     /* dequant */
     float output_scale = 0.f;
     int output_zero_point = 0;
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = ( float )output_i8[i]* output_scale;
+        output_data[i] = (float)output_i8[i] * output_scale;
 
     print_topk(output_data, output_size, 5);
     fprintf(stderr, "--------------------------------------\n");
@@ -207,34 +207,34 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_timvx.c b/examples/tm_classification_timvx.c
index b759c8ab0..4d81c25d4 100644
--- a/examples/tm_classification_timvx.c
+++ b/examples/tm_classification_timvx.c
@@ -29,15 +29,15 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 227
-#define DEFAULT_IMG_W 227
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        227
+#define DEFAULT_IMG_W        227
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
@@ -46,7 +46,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
     {
@@ -63,7 +63,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h
 }
 
 int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, float* mean, float* scale,
-					  int loop_count, int num_thread, int affinity)
+                     int loop_count, int num_thread, int affinity)
 {
     /* set runtime options */
     struct options opt;
@@ -99,8 +99,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    uint8_t* input_data = ( uint8_t* )malloc(img_size);
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    uint8_t* input_data = (uint8_t*)malloc(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -119,7 +119,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -164,16 +164,16 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor);
+    uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor);
 
     /* dequant */
     float output_scale = 0.f;
     int output_zero_point = 0;
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     print_topk(output_data, output_size, 5);
     fprintf(stderr, "--------------------------------------\n");
@@ -218,37 +218,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_trt.cpp b/examples/tm_classification_trt.cpp
index 64d0cd861..4ebed2402 100644
--- a/examples/tm_classification_trt.cpp
+++ b/examples/tm_classification_trt.cpp
@@ -29,19 +29,18 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 227
-#define DEFAULT_IMG_W 227
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        227
+#define DEFAULT_IMG_W        227
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
-
 int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
                      const float* scale, int loop_count, int num_thread, int affinity)
 {
@@ -59,7 +58,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
         return -1;
     }
     fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());
-	
+
     /* create NVIDIA TensorRT backend */
     context_t trt_context = create_context("trt", 1);
     int rtt = add_context_device(trt_context, "TensorRT");
@@ -79,8 +78,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -99,7 +98,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -141,7 +140,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     print_topk(output_data, output_size, 5);
@@ -186,37 +185,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_uint8.c b/examples/tm_classification_uint8.c
index 1a59bfdc7..dbf11c32e 100644
--- a/examples/tm_classification_uint8.c
+++ b/examples/tm_classification_uint8.c
@@ -29,15 +29,15 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 224
-#define DEFAULT_IMG_W 224
-#define DEFAULT_SCALE1 0.017f
-#define DEFAULT_SCALE2 0.017f
-#define DEFAULT_SCALE3 0.017f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        224
+#define DEFAULT_IMG_W        224
+#define DEFAULT_SCALE1       0.017f
+#define DEFAULT_SCALE2       0.017f
+#define DEFAULT_SCALE3       0.017f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 
 void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h, int img_w, float* mean, float* scale,
@@ -45,7 +45,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
     {
@@ -89,8 +89,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    uint8_t* input_data = ( uint8_t* )malloc(img_size);
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    uint8_t* input_data = (uint8_t*)malloc(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -109,7 +109,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -154,16 +154,16 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor);
+    uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor);
 
     /* dequant */
     float output_scale = 0.f;
     int output_zero_point = 0;
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     print_topk(output_data, output_size, 5);
     fprintf(stderr, "--------------------------------------\n");
@@ -207,34 +207,34 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_classification_vulkan.c b/examples/tm_classification_vulkan.c
index 97960681e..792e3ac32 100644
--- a/examples/tm_classification_vulkan.c
+++ b/examples/tm_classification_vulkan.c
@@ -29,15 +29,15 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 227
-#define DEFAULT_IMG_W 227
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        227
+#define DEFAULT_IMG_W        227
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 
 int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
@@ -72,8 +72,8 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -132,7 +132,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     print_topk(output_data, output_size, 5);
@@ -178,34 +178,34 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_crnn.cpp b/examples/tm_crnn.cpp
index c7c1ed0b7..ee3e2b1ed 100644
--- a/examples/tm_crnn.cpp
+++ b/examples/tm_crnn.cpp
@@ -57,7 +57,7 @@ void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int
     {
         cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB);
     }
-    else if (sample.channels() == 3 && img_c == 3  && swapRB == 1)
+    else if (sample.channels() == 3 && img_c == 3 && swapRB == 1)
     {
         cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
     }
@@ -75,7 +75,7 @@ void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int
         img.convertTo(img, CV_32FC3);
     else if (img_c == 1)
         img.convertTo(img, CV_32FC1);
-    float* img_data = ( float* )img.data;
+    float* img_data = (float*)img.data;
     int hw = img_h * img_w;
     for (int h = 0; h < img_h; h++)
     {
@@ -158,26 +158,26 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'l':
-                label_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'l':
+            label_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -238,7 +238,7 @@ int main(int argc, char* argv[])
 
     int img_size = img_h * img_w * 1;
     int dims[] = {1, 1, img_h, img_w};
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -293,7 +293,7 @@ int main(int argc, char* argv[])
 
     /* process the crnn result */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* ocr_data = ( float* )get_tensor_buffer(output_tensor);
+    float* ocr_data = (float*)get_tensor_buffer(output_tensor);
     process_crnn_result(ocr_data, label_file);
 
     free(input_data);
diff --git a/examples/tm_efficientdet.c b/examples/tm_efficientdet.c
index 73daeeeb0..26ed21f55 100644
--- a/examples/tm_efficientdet.c
+++ b/examples/tm_efficientdet.c
@@ -30,19 +30,18 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 512
-#define DEFAULT_IMG_W 512
-#define DEFAULT_SCALE1 0.017124754f
-#define DEFAULT_SCALE2 0.017507003f
-#define DEFAULT_SCALE3 0.017429194f
-#define DEFAULT_MEAN1 123.675
-#define DEFAULT_MEAN2 116.280
-#define DEFAULT_MEAN3 103.530
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        512
+#define DEFAULT_IMG_W        512
+#define DEFAULT_SCALE1       0.017124754f
+#define DEFAULT_SCALE2       0.017507003f
+#define DEFAULT_SCALE3       0.017429194f
+#define DEFAULT_MEAN1        123.675
+#define DEFAULT_MEAN2        116.280
+#define DEFAULT_MEAN3        103.530
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
-
 typedef struct Box
 {
     int x0;
@@ -53,20 +52,22 @@ typedef struct Box
     float score;
 } Box_t;
 
-
-void qsort_descent_inplace(Box_t* boxes, int left, int right) {
+void qsort_descent_inplace(Box_t* boxes, int left, int right)
+{
     int i = left;
     int j = right;
     float p = boxes[(left + right) / 2].score;
 
-    while (i <= j) {
+    while (i <= j)
+    {
         while (boxes[i].score > p)
             i++;
 
         while (boxes[j].score < p)
             j--;
 
-        if (i <= j) {
+        if (i <= j)
+        {
             // swap
             Box_t tmp = boxes[i];
             boxes[i] = boxes[j];
@@ -90,23 +91,26 @@ void qsort_descent_inplace(Box_t* boxes, int left, int right) {
     }
 }
 
-
-int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold) {
+int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold)
+{
     int num_outputs = num_boxes;
 
     float* areas = malloc(num_boxes * sizeof(float));
 
-    for (int i = 0; i < num_boxes; i++) {
-        areas[i] = (float) ((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0));
+    for (int i = 0; i < num_boxes; i++)
+    {
+        areas[i] = (float)((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0));
     }
 
-    for (int i = 0; i < num_boxes; i++) {
+    for (int i = 0; i < num_boxes; i++)
+    {
         const Box_t a = boxes[i];
 
         if (suppressed[i] == 1)
             continue;
 
-        for (int j = i + 1; j < num_boxes; j++) {
+        for (int j = i + 1; j < num_boxes; j++)
+        {
             const Box_t b = boxes[j];
 
             if (suppressed[j] == 1)
@@ -117,10 +121,13 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre
             float total_area = (a.x1 - a.x0) * (a.y1 - a.y0) + (b.x1 - b.x0) * (b.y1 - b.y0) - intersection;
             float iou = fmaxf(intersection / total_area, 0);
 
-            if (iou > nms_threshold){
+            if (iou > nms_threshold)
+            {
                 suppressed[j] = 1;
                 num_outputs--;
-            } else{
+            }
+            else
+            {
                 suppressed[j] = 0;
             }
         }
@@ -130,54 +137,62 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre
     return num_outputs;
 }
 
-
-float* arange(int start, int end, float stride) {
-    int length = (int) ((float) ceilf((float) (end - start) / stride));
+float* arange(int start, int end, float stride)
+{
+    int length = (int)((float)ceilf((float)(end - start) / stride));
     float* result = malloc(length * sizeof(float));
 
-    result[0] = (float) start;
-    for (int i = 1; i < length; i++) {
+    result[0] = (float)start;
+    for (int i = 1; i < length; i++)
+    {
         result[i] = result[i - 1] + stride;
     }
     return result;
 }
 
-
 void tile(const float* arr, int arr_length, int times, float offset,
-            float* result, int arr_starts_from, int arr_stride) {
+          float* result, int arr_starts_from, int arr_stride)
+{
     int length = arr_length * times;
 
-    if (result == NULL) {
+    if (result == NULL)
+    {
         result = malloc(length * sizeof(float));
         arr_starts_from = 0;
     }
 
-    for (int i = 0, j = 0; i < length; i++, j += arr_stride) {
+    for (int i = 0, j = 0; i < length; i++, j += arr_stride)
+    {
         result[j + arr_starts_from] = arr[i % arr_length] + offset;
     }
 }
 
 void repeat(const float* arr, int arr_length, int times, float offset,
-              float* result, int arr_starts_from, int arr_stride) {
+            float* result, int arr_starts_from, int arr_stride)
+{
     int length = arr_length * times;
 
-    if (result == NULL) {
+    if (result == NULL)
+    {
         result = malloc(length * sizeof(float));
         arr_starts_from = 0;
     }
 
-    for (int i = 0, j = 0; i < length; i++, j += arr_stride) {
+    for (int i = 0, j = 0; i < length; i++, j += arr_stride)
+    {
         result[j + arr_starts_from] = arr[i / times] + offset;
     }
 }
 
-
-int argmax(const float* arr, int arr_starts_from, int arr_length) {
+int argmax(const float* arr, int arr_starts_from, int arr_length)
+{
     float max_value = arr[arr_starts_from];
     int max_idx = 0;
-    for (int i = 1; i < arr_length; i++) {
+    for (int i = 1; i < arr_length; i++)
+    {
         float this_value = arr[arr_starts_from + i];
-        if (this_value > max_value) {
+        if (this_value > max_value)
+        {
             max_value = this_value;
             max_idx = i;
         }
@@ -185,28 +200,27 @@ int argmax(const float* arr, int arr_starts_from, int arr_length) {
     return max_idx;
 }
 
-
 int tengine_detect(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
-                     const float* scale, int loop_count, int num_thread, int affinity)
+                   const float* scale, int loop_count, int num_thread, int affinity)
 {
     /* setup network */
     const char* CLASSES_NAME[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-                                 "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
-                                 "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie",
-                                 "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
-                                 "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon",
-                                 "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
-                                 "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv",
-                                 "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
-                                 "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
-                                 "toothbrush"};
+                                  "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
+                                  "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie",
+                                  "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
+                                  "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon",
+                                  "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
+                                  "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv",
+                                  "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
+                                  "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
+                                  "toothbrush"};
 
     int PYRAMID_LEVELS[] = {3, 4, 5, 6, 7};
     int STRIDES[] = {8, 16, 32, 64, 128};
     float SCALES[] = {
-                (float) pow(2, 0.),
-                (float) pow(2, 1. / 3.),
-                (float) pow(2, 2. / 3.),
+        (float)pow(2, 0.),
+        (float)pow(2, 1. / 3.),
+        (float)pow(2, 2. / 3.),
     };
     float RATIOS_X[] = {1.f, 1.4f, 0.7f};
     float RATIOS_Y[] = {1.f, 0.7f, 1.4f};
@@ -243,8 +257,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -285,16 +299,19 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     int resized_h, resized_w;
     float resize_scale;
     image resImg;
-    if (raw_h > raw_w){
+    if (raw_h > raw_w)
+    {
         resized_h = img_h;
-        resized_w = (int) ((float) img_h / raw_h * raw_w);
+        resized_w = (int)((float)img_h / raw_h * raw_w);
         resImg = resize_image(im, resized_w, img_h);
-        resize_scale = (float) raw_h / img_h;
-    } else{
+        resize_scale = (float)raw_h / img_h;
+    }
+    else
+    {
         resized_w = img_w;
-        resized_h = (int) ((float) img_w / raw_w * raw_h);
+        resized_h = (int)((float)img_w / raw_w * raw_h);
         resImg = resize_image(im, img_w, resized_h);
-        resize_scale = (float) raw_w / img_w;
+        resize_scale = (float)raw_w / img_w;
     }
     free_image(im);
 
@@ -334,11 +351,11 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
 
     /* get the result of classification */
     tensor_t output_tensor_regression = get_graph_output_tensor(graph, 0, 0);
-    float* output_data_regression = ( float* )get_tensor_buffer(output_tensor_regression);
+    float* output_data_regression = (float*)get_tensor_buffer(output_tensor_regression);
     int num_anchors = get_tensor_buffer_size(output_tensor_regression) / sizeof(float) / 4;
 
     tensor_t output_tensor_classification = get_graph_output_tensor(graph, 1, 0);
-    float* output_data_classification = ( float* )get_tensor_buffer(output_tensor_classification);
+    float* output_data_classification = (float*)get_tensor_buffer(output_tensor_classification);
     int num_classes = get_tensor_buffer_size(output_tensor_classification) / sizeof(float) / num_anchors;
 
     // postprocess
@@ -349,21 +366,24 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     float* anchors_y1 = malloc(num_anchors * sizeof(float));
 
     int anchor_idx = 0;
-    for (int stride_idx = 0; stride_idx < num_levels; stride_idx++) {
+    for (int stride_idx = 0; stride_idx < num_levels; stride_idx++)
+    {
         int stride = STRIDES[stride_idx];
-        float arange_stride = powf(2, (float) PYRAMID_LEVELS[stride_idx]);
-        int length_x = (int) ceilf(((float) img_w - (float) stride / 2) / (float) arange_stride);
-        int length_y = (int) ceilf(((float) img_h - (float) stride / 2) / (float) arange_stride);
+        float arange_stride = powf(2, (float)PYRAMID_LEVELS[stride_idx]);
+        int length_x = (int)ceilf(((float)img_w - (float)stride / 2) / (float)arange_stride);
+        int length_y = (int)ceilf(((float)img_h - (float)stride / 2) / (float)arange_stride);
         float* x = arange(stride / 2, img_w, arange_stride);
         float* y = arange(stride / 2, img_h, arange_stride);
 
         int start_idx = anchor_idx;
         int num_anchor_types = num_scales * num_ratios;
-        for (int i = 0; i < num_scales; i++) {
+        for (int i = 0; i < num_scales; i++)
+        {
             float anchor_scale = SCALES[i];
-            float base_anchor_size = ANCHOR_SCALE * (float) stride * anchor_scale;
+            float base_anchor_size = ANCHOR_SCALE * (float)stride * anchor_scale;
 
-            for (int j = 0; j < num_ratios; j++) {
+            for (int j = 0; j < num_ratios; j++)
+            {
                 float ratio_x = RATIOS_X[j];
                 float ratio_y = RATIOS_Y[j];
 
@@ -391,14 +411,16 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     int num_proposals_over_threshold = 0;
 
 #pragma omp parallel for num_threads(opt.num_thread)
-    for (int i = 0; i < num_anchors; i++) {
+    for (int i = 0; i < num_anchors; i++)
+    {
         // loop over anchors
 
         // confidence
         int max_idx = argmax(output_data_classification, i * num_classes, num_classes);
         float max_score = output_data_classification[i * num_classes + max_idx];
 
-        if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD){
+        if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD)
+        {
             proposals[i].class_idx = -1;
             continue;
         }
@@ -429,24 +451,25 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
         xmax *= resize_scale;
 
         // clipping
-        xmin = fmaxf(fminf(xmin, (float) (raw_w - 1)), 0.f);
-        xmax = fmaxf(fminf(xmax, (float) (raw_w - 1)), 0.f);
-        ymin = fmaxf(fminf(ymin, (float) (raw_h - 1)), 0.f);
-        ymax = fmaxf(fminf(ymax, (float) (raw_h - 1)), 0.f);
+        xmin = fmaxf(fminf(xmin, (float)(raw_w - 1)), 0.f);
+        xmax = fmaxf(fminf(xmax, (float)(raw_w - 1)), 0.f);
+        ymin = fmaxf(fminf(ymin, (float)(raw_h - 1)), 0.f);
+        ymax = fmaxf(fminf(ymax, (float)(raw_h - 1)), 0.f);
 
         // area filtering
         float area = (xmax - xmin) * (ymax - ymin);
-        if (area < 4){
+        if (area < 4)
+        {
             proposals[i].class_idx = -1;
             continue;
         }
 
         num_proposals_over_threshold++;
 
-        proposals[i].x0 = (int) xmin;
-        proposals[i].x1 = (int) xmax;
-        proposals[i].y0 = (int) ymin;
-        proposals[i].y1 = (int) ymax;
+        proposals[i].x0 = (int)xmin;
+        proposals[i].x1 = (int)xmax;
+        proposals[i].y0 = (int)ymin;
+        proposals[i].y1 = (int)ymax;
     }
     free(anchors_x0);
     free(anchors_x1);
@@ -456,16 +479,18 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     // filter boxes with confidence threshold
     Box_t* proposals_over_threshold = malloc(sizeof(Box_t) * num_proposals_over_threshold);
     int proposals_over_threshold_idx = 0;
-    for (int i = 0; i < num_anchors; i++) {
+    for (int i = 0; i < num_anchors; i++)
+    {
         Box_t box = proposals[i];
-        if(box.class_idx == -1)
+        if (box.class_idx == -1)
             continue;
         proposals_over_threshold[proposals_over_threshold_idx] = box;
         proposals_over_threshold_idx++;
     }
     free(proposals);
 
-    if (num_proposals_over_threshold > 0){
+    if (num_proposals_over_threshold > 0)
+    {
         // sort boxes
         qsort_descent_inplace(proposals_over_threshold, 0, num_proposals_over_threshold - 1);
 
@@ -474,9 +499,10 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
         int num_outputs = nms(proposals_over_threshold, num_proposals_over_threshold, suppressed, NMS_THRESHOLD);
         Box_t* proposals_after_nms = malloc(num_outputs * sizeof(Box_t));
         int proposals_after_nms_idx = 0;
-        for(int i = 0; i < num_proposals_over_threshold; i++){
+        for (int i = 0; i < num_proposals_over_threshold; i++)
+        {
             Box_t box = proposals_over_threshold[i];
-            if(suppressed[i] == 1)
+            if (suppressed[i] == 1)
                 continue;
             proposals_after_nms[proposals_after_nms_idx] = box;
             proposals_after_nms_idx++;
@@ -536,37 +562,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_efficientdet_uint8.c b/examples/tm_efficientdet_uint8.c
index f25aa64c5..cc61bfea0 100644
--- a/examples/tm_efficientdet_uint8.c
+++ b/examples/tm_efficientdet_uint8.c
@@ -28,19 +28,18 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 512
-#define DEFAULT_IMG_W 512
-#define DEFAULT_SCALE1 0.017124754f
-#define DEFAULT_SCALE2 0.017507003f
-#define DEFAULT_SCALE3 0.017429194f
-#define DEFAULT_MEAN1 123.675
-#define DEFAULT_MEAN2 116.280
-#define DEFAULT_MEAN3 103.530
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        512
+#define DEFAULT_IMG_W        512
+#define DEFAULT_SCALE1       0.017124754f
+#define DEFAULT_SCALE2       0.017507003f
+#define DEFAULT_SCALE3       0.017429194f
+#define DEFAULT_MEAN1        123.675
+#define DEFAULT_MEAN2        116.280
+#define DEFAULT_MEAN3        103.530
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
-
 typedef struct Box
 {
     int x0;
@@ -51,20 +50,22 @@ typedef struct Box
     float score;
 } Box_t;
 
-
-void qsort_descent_inplace(Box_t* boxes, int left, int right) {
+void qsort_descent_inplace(Box_t* boxes, int left, int right)
+{
     int i = left;
     int j = right;
     float p = boxes[(left + right) / 2].score;
 
-    while (i <= j) {
+    while (i <= j)
+    {
         while (boxes[i].score > p)
             i++;
 
         while (boxes[j].score < p)
             j--;
 
-        if (i <= j) {
+        if (i <= j)
+        {
             // swap
             Box_t tmp = boxes[i];
             boxes[i] = boxes[j];
@@ -88,23 +89,26 @@ void qsort_descent_inplace(Box_t* boxes, int left, int right) {
     }
 }
 
-
-int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold) {
+int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_threshold)
+{
     int num_outputs = num_boxes;
 
     float* areas = malloc(num_boxes * sizeof(float));
 
-    for (int i = 0; i < num_boxes; i++) {
-        areas[i] = (float) ((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0));
+    for (int i = 0; i < num_boxes; i++)
+    {
+        areas[i] = (float)((boxes[i].x1 - boxes[i].x0) * (boxes[i].y1 - boxes[i].y0));
     }
 
-    for (int i = 0; i < num_boxes; i++) {
+    for (int i = 0; i < num_boxes; i++)
+    {
         const Box_t a = boxes[i];
 
         if (suppressed[i] == 1)
             continue;
 
-        for (int j = i + 1; j < num_boxes; j++) {
+        for (int j = i + 1; j < num_boxes; j++)
+        {
             const Box_t b = boxes[j];
 
             if (suppressed[j] == 1)
@@ -115,10 +119,13 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre
             float total_area = (a.x1 - a.x0) * (a.y1 - a.y0) + (b.x1 - b.x0) * (b.y1 - b.y0) - intersection;
             float iou = fmaxf(intersection / total_area, 0);
 
-            if (iou > nms_threshold){
+            if (iou > nms_threshold)
+            {
                 suppressed[j] = 1;
                 num_outputs--;
-            } else{
+            }
+            else
+            {
                 suppressed[j] = 0;
             }
         }
@@ -128,54 +135,62 @@ int nms(const Box_t* boxes, const int num_boxes, int* suppressed, float nms_thre
     return num_outputs;
 }
 
-
-float* arange(int start, int end, float stride) {
-    int length = (int) ((float) ceilf((float) (end - start) / stride));
+float* arange(int start, int end, float stride)
+{
+    int length = (int)((float)ceilf((float)(end - start) / stride));
     float* result = malloc(length * sizeof(float));
 
-    result[0] = (float) start;
-    for (int i = 1; i < length; i++) {
+    result[0] = (float)start;
+    for (int i = 1; i < length; i++)
+    {
         result[i] = result[i - 1] + stride;
     }
     return result;
 }
 
-
 void tile(const float* arr, int arr_length, int times, float offset,
-            float* result, int arr_starts_from, int arr_stride) {
+          float* result, int arr_starts_from, int arr_stride)
+{
     int length = arr_length * times;
 
-    if (result == NULL) {
+    if (result == NULL)
+    {
         result = malloc(length * sizeof(float));
         arr_starts_from = 0;
     }
 
-    for (int i = 0, j = 0; i < length; i++, j += arr_stride) {
+    for (int i = 0, j = 0; i < length; i++, j += arr_stride)
+    {
         result[j + arr_starts_from] = arr[i % arr_length] + offset;
     }
 }
 
 void repeat(const float* arr, int arr_length, int times, float offset,
-              float* result, int arr_starts_from, int arr_stride) {
+            float* result, int arr_starts_from, int arr_stride)
+{
     int length = arr_length * times;
 
-    if (result == NULL) {
+    if (result == NULL)
+    {
         result = malloc(length * sizeof(float));
         arr_starts_from = 0;
     }
 
-    for (int i = 0, j = 0; i < length; i++, j += arr_stride) {
+    for (int i = 0, j = 0; i < length; i++, j += arr_stride)
+    {
         result[j + arr_starts_from] = arr[i / times] + offset;
     }
 }
 
-
-int argmax(const float* arr, int arr_starts_from, int arr_length) {
+int argmax(const float* arr, int arr_starts_from, int arr_length)
+{
     float max_value = arr[arr_starts_from];
     int max_idx = 0;
-    for (int i = 1; i < arr_length; i++) {
+    for (int i = 1; i < arr_length; i++)
+    {
         float this_value = arr[arr_starts_from + i];
-        if (this_value > max_value) {
+        if (this_value > max_value)
+        {
             max_value = this_value;
             max_idx = i;
         }
@@ -183,28 +198,27 @@ int argmax(const float* arr, int arr_starts_from, int arr_length) {
     return max_idx;
 }
 
-
 int tengine_detect(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
-                     const float* scale, int loop_count, int num_thread, int affinity)
+                   const float* scale, int loop_count, int num_thread, int affinity)
 {
     /* setup network */
     const char* CLASSES_NAME[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-                                 "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
-                                 "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie",
-                                 "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
-                                 "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon",
-                                 "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
-                                 "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv",
-                                 "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
-                                 "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
-                                 "toothbrush"};
+                                  "fire hydrant", "", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
+                                  "cow", "elephant", "bear", "zebra", "giraffe", "", "backpack", "umbrella", "", "", "handbag", "tie",
+                                  "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
+                                  "skateboard", "surfboard", "tennis racket", "bottle", "", "wine glass", "cup", "fork", "knife", "spoon",
+                                  "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
+                                  "cake", "chair", "couch", "potted plant", "bed", "", "dining table", "", "", "toilet", "", "tv",
+                                  "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
+                                  "refrigerator", "", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
+                                  "toothbrush"};
 
     int PYRAMID_LEVELS[] = {3, 4, 5, 6, 7};
     int STRIDES[] = {8, 16, 32, 64, 128};
     float SCALES[] = {
-                (float) pow(2, 0.),
-                (float) pow(2, 1. / 3.),
-                (float) pow(2, 2. / 3.),
+        (float)pow(2, 0.),
+        (float)pow(2, 1. / 3.),
+        (float)pow(2, 2. / 3.),
     };
     float RATIOS_X[] = {1.f, 1.4f, 0.7f};
     float RATIOS_Y[] = {1.f, 0.7f, 1.4f};
@@ -241,8 +255,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    uint8_t* input_data = ( uint8_t* )malloc(img_size * sizeof(uint8_t));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    uint8_t* input_data = (uint8_t*)malloc(img_size * sizeof(uint8_t));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -283,16 +297,19 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     int resized_h, resized_w;
     float resize_scale;
     image resImg;
-    if (raw_h > raw_w){
+    if (raw_h > raw_w)
+    {
         resized_h = img_h;
-        resized_w = (int) ((float) img_h / raw_h * raw_w);
+        resized_w = (int)((float)img_h / raw_h * raw_w);
         resImg = resize_image(im, resized_w, img_h);
-        resize_scale = (float) raw_h / img_h;
-    } else{
+        resize_scale = (float)raw_h / img_h;
+    }
+    else
+    {
         resized_w = img_w;
-        resized_h = (int) ((float) img_w / raw_w * raw_h);
+        resized_h = (int)((float)img_w / raw_w * raw_h);
         resImg = resize_image(im, img_w, resized_h);
-        resize_scale = (float) raw_w / img_w;
+        resize_scale = (float)raw_w / img_w;
     }
     free_image(im);
 
@@ -347,12 +364,12 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
 
     /* get the result of classification */
     tensor_t output_tensor_regression = get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_data_regression_u8 = ( uint8_t* )get_tensor_buffer(output_tensor_regression);
+    uint8_t* output_data_regression_u8 = (uint8_t*)get_tensor_buffer(output_tensor_regression);
     int num_anchors_data = get_tensor_buffer_size(output_tensor_regression);
     int num_anchors = get_tensor_buffer_size(output_tensor_regression) / 4;
 
     tensor_t output_tensor_classification = get_graph_output_tensor(graph, 1, 0);
-    uint8_t* output_data_classification_u8 = ( uint8_t* )get_tensor_buffer(output_tensor_classification);
+    uint8_t* output_data_classification_u8 = (uint8_t*)get_tensor_buffer(output_tensor_classification);
     int num_classes_data = get_tensor_buffer_size(output_tensor_classification);
     int num_classes = get_tensor_buffer_size(output_tensor_classification) / num_anchors;
 
@@ -360,16 +377,16 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     float output_scale_regression = 0.f;
     int output_zero_point_regression = 0;
     get_tensor_quant_param(output_tensor_regression, &output_scale_regression, &output_zero_point_regression, 1);
-    float* output_data_regression = ( float* )malloc(num_anchors_data * sizeof(float));
+    float* output_data_regression = (float*)malloc(num_anchors_data * sizeof(float));
     for (int i = 0; i < num_anchors_data; i++)
-        output_data_regression[i] = (( float )output_data_regression_u8[i] - ( float )output_zero_point_regression) * output_scale_regression;
+        output_data_regression[i] = ((float)output_data_regression_u8[i] - (float)output_zero_point_regression) * output_scale_regression;
 
     float output_scale_classification = 0.f;
     int output_zero_point_classification = 0;
     get_tensor_quant_param(output_tensor_classification, &output_scale_classification, &output_zero_point_classification, 1);
-    float* output_data_classification = ( float* )malloc(num_classes_data * sizeof(float));
+    float* output_data_classification = (float*)malloc(num_classes_data * sizeof(float));
     for (int i = 0; i < num_classes_data; i++)
-        output_data_classification[i] = (( float )output_data_classification_u8[i] - ( float )output_zero_point_classification) * output_scale_classification;            
+        output_data_classification[i] = ((float)output_data_classification_u8[i] - (float)output_zero_point_classification) * output_scale_classification;
 
     // postprocess
     // generate anchors
@@ -379,21 +396,24 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     float* anchors_y1 = malloc(num_anchors * sizeof(float));
 
     int anchor_idx = 0;
-    for (int stride_idx = 0; stride_idx < num_levels; stride_idx++) {
+    for (int stride_idx = 0; stride_idx < num_levels; stride_idx++)
+    {
         int stride = STRIDES[stride_idx];
-        float arange_stride = powf(2, (float) PYRAMID_LEVELS[stride_idx]);
-        int length_x = (int) ceilf(((float) img_w - (float) stride / 2) / (float) arange_stride);
-        int length_y = (int) ceilf(((float) img_h - (float) stride / 2) / (float) arange_stride);
+        float arange_stride = powf(2, (float)PYRAMID_LEVELS[stride_idx]);
+        int length_x = (int)ceilf(((float)img_w - (float)stride / 2) / (float)arange_stride);
+        int length_y = (int)ceilf(((float)img_h - (float)stride / 2) / (float)arange_stride);
         float* x = arange(stride / 2, img_w, arange_stride);
         float* y = arange(stride / 2, img_h, arange_stride);
 
         int start_idx = anchor_idx;
         int num_anchor_types = num_scales * num_ratios;
-        for (int i = 0; i < num_scales; i++) {
+        for (int i = 0; i < num_scales; i++)
+        {
             float anchor_scale = SCALES[i];
-            float base_anchor_size = ANCHOR_SCALE * (float) stride * anchor_scale;
+            float base_anchor_size = ANCHOR_SCALE * (float)stride * anchor_scale;
 
-            for (int j = 0; j < num_ratios; j++) {
+            for (int j = 0; j < num_ratios; j++)
+            {
                 float ratio_x = RATIOS_X[j];
                 float ratio_y = RATIOS_Y[j];
 
@@ -421,14 +441,16 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     int num_proposals_over_threshold = 0;
 
 #pragma omp parallel for num_threads(opt.num_thread)
-    for (int i = 0; i < num_anchors; i++) {
+    for (int i = 0; i < num_anchors; i++)
+    {
         // loop over anchors
 
         // confidence
         int max_idx = argmax(output_data_classification, i * num_classes, num_classes);
         float max_score = output_data_classification[i * num_classes + max_idx];
 
-        if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD){
+        if (isinf(max_score) || max_score < CONFIDENCE_THRESHOLD)
+        {
             proposals[i].class_idx = -1;
             continue;
         }
@@ -459,24 +481,25 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
         xmax *= resize_scale;
 
         // clipping
-        xmin = fmaxf(fminf(xmin, (float) (raw_w - 1)), 0.f);
-        xmax = fmaxf(fminf(xmax, (float) (raw_w - 1)), 0.f);
-        ymin = fmaxf(fminf(ymin, (float) (raw_h - 1)), 0.f);
-        ymax = fmaxf(fminf(ymax, (float) (raw_h - 1)), 0.f);
+        xmin = fmaxf(fminf(xmin, (float)(raw_w - 1)), 0.f);
+        xmax = fmaxf(fminf(xmax, (float)(raw_w - 1)), 0.f);
+        ymin = fmaxf(fminf(ymin, (float)(raw_h - 1)), 0.f);
+        ymax = fmaxf(fminf(ymax, (float)(raw_h - 1)), 0.f);
 
         // area filtering
         float area = (xmax - xmin) * (ymax - ymin);
-        if (area < 4){
+        if (area < 4)
+        {
             proposals[i].class_idx = -1;
             continue;
         }
 
         num_proposals_over_threshold++;
 
-        proposals[i].x0 = (int) xmin;
-        proposals[i].x1 = (int) xmax;
-        proposals[i].y0 = (int) ymin;
-        proposals[i].y1 = (int) ymax;
+        proposals[i].x0 = (int)xmin;
+        proposals[i].x1 = (int)xmax;
+        proposals[i].y0 = (int)ymin;
+        proposals[i].y1 = (int)ymax;
     }
     free(anchors_x0);
     free(anchors_x1);
@@ -486,16 +509,18 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     // filter boxes wiht confidence threshold
     Box_t* proposals_over_threshold = malloc(sizeof(Box_t) * num_proposals_over_threshold);
     int proposals_over_threshold_idx = 0;
-    for (int i = 0; i < num_anchors; i++) {
+    for (int i = 0; i < num_anchors; i++)
+    {
         Box_t box = proposals[i];
-        if(box.class_idx == -1)
+        if (box.class_idx == -1)
             continue;
         proposals_over_threshold[proposals_over_threshold_idx] = box;
         proposals_over_threshold_idx++;
     }
     free(proposals);
 
-    if (num_proposals_over_threshold > 0){
+    if (num_proposals_over_threshold > 0)
+    {
         // sort boxes
         qsort_descent_inplace(proposals_over_threshold, 0, num_proposals_over_threshold - 1);
 
@@ -504,9 +529,10 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
         int num_outputs = nms(proposals_over_threshold, num_proposals_over_threshold, suppressed, NMS_THRESHOLD);
         Box_t* proposals_after_nms = malloc(num_outputs * sizeof(Box_t));
         int proposals_after_nms_idx = 0;
-        for(int i = 0; i < num_proposals_over_threshold; i++){
+        for (int i = 0; i < num_proposals_over_threshold; i++)
+        {
             Box_t box = proposals_over_threshold[i];
-            if(suppressed[i] == 1)
+            if (suppressed[i] == 1)
                 continue;
             proposals_after_nms[proposals_after_nms_idx] = box;
             proposals_after_nms_idx++;
@@ -568,37 +594,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_hrnet.cpp b/examples/tm_hrnet.cpp
index c81b0b3d4..bb26f404b 100644
--- a/examples/tm_hrnet.cpp
+++ b/examples/tm_hrnet.cpp
@@ -39,58 +39,60 @@
 
 #define DEFAULT_REPEAT_COUNT 1
 #define DEFAULT_THREAD_COUNT 1
-#define LETTERBOX_ROWS 256
-#define LETTERBOX_COLS 256
-#define MODEL_CHANNELS 3
-#define HEATMAP_CHANNEL 16
+#define LETTERBOX_ROWS       256
+#define LETTERBOX_COLS       256
+#define MODEL_CHANNELS       3
+#define HEATMAP_CHANNEL      16
 
-typedef struct {
+typedef struct
+{
     float x;
     float y;
     float score;
 } ai_point_t;
 
-struct skeleton {
+struct skeleton
+{
     int connection[2];
     int left_right_neutral;
 };
 
-std::vector<skeleton> pairs = {{0,  1,  0},
-                               {1,  2,  0},
-                               {3,  4,  1},
-                               {4,  5,  1},
-                               {2,  6,  0},
-                               {3,  6,  1},
-                               {6,  7,  2},
-                               {7,  8,  2},
-                               {8,  9,  2},
-                               {13, 7,  1},
+std::vector<skeleton> pairs = {{0, 1, 0},
+                               {1, 2, 0},
+                               {3, 4, 1},
+                               {4, 5, 1},
+                               {2, 6, 0},
+                               {3, 6, 1},
+                               {6, 7, 2},
+                               {7, 8, 2},
+                               {8, 9, 2},
+                               {13, 7, 1},
                                {10, 11, 0},
-                               {7,  12, 0},
+                               {7, 12, 0},
                                {12, 11, 0},
                                {13, 14, 1},
                                {14, 15, 1}};
 
-
-typedef struct {
+typedef struct
+{
     std::vector<ai_point_t> keypoints;
     int32_t img_width = 0;
     int32_t img_heigh = 0;
     uint64_t timestamp = 0;
 } ai_body_parts_s;
 
-void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_idx_height, float *max_value, int c) 
+void FindMax2D(float* buf, int width, int height, int* max_idx_width, int* max_idx_height, float* max_value, int c)
 {
-    float *ptr = buf;
+    float* ptr = buf;
     *max_value = -10.f;
     *max_idx_width = 0;
     *max_idx_height = 0;
-    for (int h = 0; h < height; h++) 
+    for (int h = 0; h < height; h++)
     {
-        for (int w = 0; w < width; w++) 
+        for (int w = 0; w < width; w++)
         {
             float score = ptr[c * height * width + h * height + w];
-            if (score > *max_value) 
+            if (score > *max_value)
             {
                 *max_value = score;
                 *max_idx_height = h;
@@ -100,7 +102,7 @@ void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_i
     }
 }
 
-void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w) 
+void PostProcess(float* data, ai_body_parts_s& pose, int img_h, int img_w)
 {
     int heatmap_width = img_w / 4;
     int heatmap_height = img_h / 4;
@@ -108,21 +110,20 @@ void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w)
     float max_score;
 
     ai_point_t kp;
-    for (int c = 0; c < HEATMAP_CHANNEL; ++c) 
+    for (int c = 0; c < HEATMAP_CHANNEL; ++c)
     {
         FindMax2D(data, heatmap_width, heatmap_height, &max_idx_width, &max_idx_height, &max_score, c);
-        kp.x = (float) max_idx_width / (float) heatmap_width;
-        kp.y = (float) max_idx_height / (float) heatmap_height;
+        kp.x = (float)max_idx_width / (float)heatmap_width;
+        kp.y = (float)max_idx_height / (float)heatmap_height;
         kp.score = max_score;
         pose.keypoints.push_back(kp);
 
         std::cout << "x: " << pose.keypoints[c].x * 64 << ", y: " << pose.keypoints[c].y * 64 << ", score: "
                   << pose.keypoints[c].score << std::endl;
-
     }
 }
 
-void draw_result(cv::Mat img, ai_body_parts_s &pose)
+void draw_result(cv::Mat img, ai_body_parts_s& pose)
 {
     /* recover process to draw */
     float scale_letterbox;
@@ -145,8 +146,8 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose)
 
     for (int i = 0; i < HEATMAP_CHANNEL; i++)
     {
-        int x = (int) ((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x);
-        int y = (int) ((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
+        int x = (int)((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x);
+        int y = (int)((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
 
         x = std::max(std::min(x, (img.cols - 1)), 0);
         y = std::max(std::min(y, (img.rows - 1)), 0);
@@ -157,24 +158,24 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose)
     cv::Scalar color;
     cv::Point pt1;
     cv::Point pt2;
-    for (auto &element: pairs)
+    for (auto& element : pairs)
     {
-        switch(element.left_right_neutral)
+        switch (element.left_right_neutral)
         {
-            case 0:
-                color = cv::Scalar(255, 0, 0);
-                break;
-            case 1:
-                color = cv::Scalar(0, 0, 255);
-                break;
-            default:
-                color = cv::Scalar(0, 255, 0);
+        case 0:
+            color = cv::Scalar(255, 0, 0);
+            break;
+        case 1:
+            color = cv::Scalar(0, 0, 255);
+            break;
+        default:
+            color = cv::Scalar(0, 255, 0);
         }
 
-        int x1 = (int) ((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
-        int y1 = (int) ((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
-        int x2 = (int) ((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
-        int y2 = (int) ((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
+        int x1 = (int)((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
+        int y1 = (int)((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
+        int x2 = (int)((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
+        int y2 = (int)((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
 
         x1 = std::max(std::min(x1, (img.cols - 1)), 0);
         y1 = std::max(std::min(y1, (img.rows - 1)), 0);
@@ -187,7 +188,7 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose)
     }
 }
 
-void get_input_fp32_data_square(const char *image_file, float *input_data, float *mean, float *scale) 
+void get_input_fp32_data_square(const char* image_file, float* input_data, float* mean, float* scale)
 {
     cv::Mat img = cv::imread(image_file);
 
@@ -215,15 +216,15 @@ void get_input_fp32_data_square(const char *image_file, float *input_data, float
     int right = (LETTERBOX_COLS - resize_cols + 1) / 2;
     // Letterbox filling
     cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
-//    cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test
-    float *img_data = (float *) img_new.data;
+    //    cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test
+    float* img_data = (float*)img_new.data;
 
     /* nhwc to nchw */
-    for (int h = 0; h < LETTERBOX_ROWS; h++) 
+    for (int h = 0; h < LETTERBOX_ROWS; h++)
     {
-        for (int w = 0; w < LETTERBOX_COLS; w++) 
+        for (int w = 0; w < LETTERBOX_COLS; w++)
         {
-            for (int c = 0; c < MODEL_CHANNELS; c++) 
+            for (int c = 0; c < MODEL_CHANNELS; c++)
             {
                 int in_index = h * LETTERBOX_COLS * MODEL_CHANNELS + w * MODEL_CHANNELS + c;
                 int out_index = c * LETTERBOX_ROWS * LETTERBOX_COLS + h * LETTERBOX_COLS + w;
@@ -233,17 +234,17 @@ void get_input_fp32_data_square(const char *image_file, float *input_data, float
     }
 }
 
-void show_usage() 
+void show_usage()
 {
     fprintf(stderr, "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
-int main(int argc, char *argv[]) 
+int main(int argc, char* argv[])
 {
     int repeat_count = DEFAULT_REPEAT_COUNT;
     int num_thread = DEFAULT_THREAD_COUNT;
-    char *model_file = nullptr;
-    char *image_file = nullptr;
+    char* model_file = nullptr;
+    char* image_file = nullptr;
     int img_h = LETTERBOX_COLS;
     int img_w = LETTERBOX_ROWS;
     ai_body_parts_s pose;
@@ -252,39 +253,39 @@ int main(int argc, char *argv[])
     float scale[3] = {0.017125f, 0.017507f, 0.017429f};
 
     int res;
-    while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) 
+    while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1)
     {
-        switch (res) 
+        switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
     /* check files */
-    if (model_file == nullptr) 
+    if (model_file == nullptr)
     {
         fprintf(stderr, "Error: Tengine model file not specified!\n");
         show_usage();
         return -1;
     }
 
-    if (image_file == nullptr) 
+    if (image_file == nullptr)
     {
         fprintf(stderr, "Error: Image file not specified!\n");
         show_usage();
@@ -311,7 +312,7 @@ int main(int argc, char *argv[])
 
     /* create graph, load tengine model xxx.tmfile */
     graph_t graph = create_graph(nullptr, "tengine", model_file);
-    if (graph == nullptr) 
+    if (graph == nullptr)
     {
         fprintf(stderr, "Create graph failed.\n");
         return -1;
@@ -319,30 +320,30 @@ int main(int argc, char *argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
+    int dims[] = {1, 3, img_h, img_w}; // nchw
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
-    if (input_tensor == nullptr) 
+    if (input_tensor == nullptr)
     {
         fprintf(stderr, "Get input tensor failed\n");
         return -1;
     }
 
-    if (set_tensor_shape(input_tensor, dims, 4) < 0) 
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
     {
         fprintf(stderr, "Set input tensor shape failed\n");
         return -1;
     }
 
-    if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) 
+    if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0)
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
     }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
-    if (prerun_graph_multithread(graph, opt) < 0) 
+    if (prerun_graph_multithread(graph, opt) < 0)
     {
         fprintf(stderr, "Prerun multithread graph failed.\n");
         return -1;
@@ -355,10 +356,10 @@ int main(int argc, char *argv[])
     double min_time = DBL_MAX;
     double max_time = DBL_MIN;
     double total_time = 0.;
-    for (int i = 0; i < repeat_count; i++) 
+    for (int i = 0; i < repeat_count; i++)
     {
         double start = get_current_time();
-        if (run_graph(graph, 1) < 0) 
+        if (run_graph(graph, 1) < 0)
         {
             fprintf(stderr, "Run graph failed\n");
             return -1;
@@ -370,11 +371,11 @@ int main(int argc, char *argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat [%d] min %.3f ms, max %.3f ms, avg %.3f ms\n", repeat_count, min_time, max_time,
-           total_time / repeat_count);
+            total_time / repeat_count);
 
     /* get output tensor */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float *data = (float *) (get_tensor_buffer(output_tensor));
+    float* data = (float*)(get_tensor_buffer(output_tensor));
 
     PostProcess(data, pose, img_h, img_w);
 
diff --git a/examples/tm_hrnet_timvx.cpp b/examples/tm_hrnet_timvx.cpp
index a677b05ea..296ce5b91 100644
--- a/examples/tm_hrnet_timvx.cpp
+++ b/examples/tm_hrnet_timvx.cpp
@@ -37,58 +37,60 @@
 
 #define DEFAULT_REPEAT_COUNT 1
 #define DEFAULT_THREAD_COUNT 1
-#define LETTERBOX_ROWS 256
-#define LETTERBOX_COLS 256
-#define MODEL_CHANNELS 3
-#define HEATMAP_CHANNEL 16
+#define LETTERBOX_ROWS       256
+#define LETTERBOX_COLS       256
+#define MODEL_CHANNELS       3
+#define HEATMAP_CHANNEL      16
 
-typedef struct {
+typedef struct
+{
     float x;
     float y;
     float score;
 } ai_point_t;
 
-struct skeleton {
+struct skeleton
+{
     int connection[2];
     int left_right_neutral;
 };
 
-std::vector<skeleton> pairs = {{0,  1,  0},
-                               {1,  2,  0},
-                               {3,  4,  1},
-                               {4,  5,  1},
-                               {2,  6,  0},
-                               {3,  6,  1},
-                               {6,  7,  2},
-                               {7,  8,  2},
-                               {8,  9,  2},
-                               {13, 7,  1},
+std::vector<skeleton> pairs = {{0, 1, 0},
+                               {1, 2, 0},
+                               {3, 4, 1},
+                               {4, 5, 1},
+                               {2, 6, 0},
+                               {3, 6, 1},
+                               {6, 7, 2},
+                               {7, 8, 2},
+                               {8, 9, 2},
+                               {13, 7, 1},
                                {10, 11, 0},
-                               {7,  12, 0},
+                               {7, 12, 0},
                                {12, 11, 0},
                                {13, 14, 1},
                                {14, 15, 1}};
 
-
-typedef struct {
+typedef struct
+{
     std::vector<ai_point_t> keypoints;
     int32_t img_width = 0;
     int32_t img_heigh = 0;
     uint64_t timestamp = 0;
 } ai_body_parts_s;
 
-void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_idx_height, float *max_value, int c) 
+void FindMax2D(float* buf, int width, int height, int* max_idx_width, int* max_idx_height, float* max_value, int c)
 {
-    float *ptr = buf;
+    float* ptr = buf;
     *max_value = -10.f;
     *max_idx_width = 0;
     *max_idx_height = 0;
-    for (int h = 0; h < height; h++) 
+    for (int h = 0; h < height; h++)
     {
-        for (int w = 0; w < width; w++) 
+        for (int w = 0; w < width; w++)
         {
             float score = ptr[c * height * width + h * height + w];
-            if (score > *max_value) 
+            if (score > *max_value)
             {
                 *max_value = score;
                 *max_idx_height = h;
@@ -98,7 +100,7 @@ void FindMax2D(float *buf, int width, int height, int *max_idx_width, int *max_i
     }
 }
 
-void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w) 
+void PostProcess(float* data, ai_body_parts_s& pose, int img_h, int img_w)
 {
     int heatmap_width = img_w / 4;
     int heatmap_height = img_h / 4;
@@ -106,21 +108,20 @@ void PostProcess(float *data, ai_body_parts_s &pose, int img_h, int img_w)
     float max_score;
 
     ai_point_t kp;
-    for (int c = 0; c < HEATMAP_CHANNEL; ++c) 
+    for (int c = 0; c < HEATMAP_CHANNEL; ++c)
     {
         FindMax2D(data, heatmap_width, heatmap_height, &max_idx_width, &max_idx_height, &max_score, c);
-        kp.x = (float) max_idx_width / (float) heatmap_width;
-        kp.y = (float) max_idx_height / (float) heatmap_height;
+        kp.x = (float)max_idx_width / (float)heatmap_width;
+        kp.y = (float)max_idx_height / (float)heatmap_height;
         kp.score = max_score;
         pose.keypoints.push_back(kp);
 
         std::cout << "x: " << pose.keypoints[c].x * 64 << ", y: " << pose.keypoints[c].y * 64 << ", score: "
                   << pose.keypoints[c].score << std::endl;
-
     }
 }
 
-void draw_result(cv::Mat img, ai_body_parts_s &pose)
+void draw_result(cv::Mat img, ai_body_parts_s& pose)
 {
     /* recover process to draw */
     float scale_letterbox;
@@ -143,8 +144,8 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose)
 
     for (int i = 0; i < HEATMAP_CHANNEL; i++)
     {
-        int x = (int) ((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x);
-        int y = (int) ((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
+        int x = (int)((pose.keypoints[i].x * LETTERBOX_COLS - tmp_w) * ratio_x);
+        int y = (int)((pose.keypoints[i].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
 
         x = std::max(std::min(x, (img.cols - 1)), 0);
         y = std::max(std::min(y, (img.rows - 1)), 0);
@@ -155,24 +156,24 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose)
     cv::Scalar color;
     cv::Point pt1;
     cv::Point pt2;
-    for (auto &element: pairs)
+    for (auto& element : pairs)
     {
-        switch(element.left_right_neutral)
+        switch (element.left_right_neutral)
         {
-            case 0:
-                color = cv::Scalar(255, 0, 0);
-                break;
-            case 1:
-                color = cv::Scalar(0, 0, 255);
-                break;
-            default:
-                color = cv::Scalar(0, 255, 0);
+        case 0:
+            color = cv::Scalar(255, 0, 0);
+            break;
+        case 1:
+            color = cv::Scalar(0, 0, 255);
+            break;
+        default:
+            color = cv::Scalar(0, 255, 0);
         }
 
-        int x1 = (int) ((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
-        int y1 = (int) ((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
-        int x2 = (int) ((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
-        int y2 = (int) ((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
+        int x1 = (int)((pose.keypoints[element.connection[0]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
+        int y1 = (int)((pose.keypoints[element.connection[0]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
+        int x2 = (int)((pose.keypoints[element.connection[1]].x * LETTERBOX_COLS - tmp_w) * ratio_x);
+        int y2 = (int)((pose.keypoints[element.connection[1]].y * LETTERBOX_ROWS - tmp_h) * ratio_y);
 
         x1 = std::max(std::min(x1, (img.cols - 1)), 0);
         y1 = std::max(std::min(y1, (img.rows - 1)), 0);
@@ -185,8 +186,8 @@ void draw_result(cv::Mat img, ai_body_parts_s &pose)
     }
 }
 
-void get_input_uint8_data_square(const char *image_file, uint8_t *input_data, float *mean, float *scale,
-                                 float input_scale, int zero_point) 
+void get_input_uint8_data_square(const char* image_file, uint8_t* input_data, float* mean, float* scale,
+                                 float input_scale, int zero_point)
 {
     cv::Mat img = cv::imread(image_file);
 
@@ -214,21 +215,21 @@ void get_input_uint8_data_square(const char *image_file, uint8_t *input_data, fl
     int right = (LETTERBOX_COLS - resize_cols + 1) / 2;
     // Letterbox filling
     cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
-//    cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test
-    float *img_data = (float *) img_new.data;
+    //    cv::imwrite("hrnet_lb_image.jpg", img_new); // for letterbox test
+    float* img_data = (float*)img_new.data;
 
     /* nhwc to nchw */
-    for (int h = 0; h < LETTERBOX_ROWS; h++) 
-	{
-        for (int w = 0; w < LETTERBOX_COLS; w++) 
-		{
-            for (int c = 0; c < MODEL_CHANNELS; c++) 
-			{
+    for (int h = 0; h < LETTERBOX_ROWS; h++)
+    {
+        for (int w = 0; w < LETTERBOX_COLS; w++)
+        {
+            for (int c = 0; c < MODEL_CHANNELS; c++)
+            {
                 int in_index = h * LETTERBOX_COLS * MODEL_CHANNELS + w * MODEL_CHANNELS + c;
                 int out_index = c * LETTERBOX_ROWS * LETTERBOX_COLS + h * LETTERBOX_COLS + w;
                 float input_temp = (img_data[in_index] - mean[c]) * scale[c];
                 /* quant to uint8 */
-                int udata = (round)(input_temp  / input_scale + ( float )zero_point);
+                int udata = (round)(input_temp / input_scale + (float)zero_point);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -239,17 +240,17 @@ void get_input_uint8_data_square(const char *image_file, uint8_t *input_data, fl
     }
 }
 
-void show_usage() 
+void show_usage()
 {
     fprintf(stderr, "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
-int main(int argc, char *argv[]) 
+int main(int argc, char* argv[])
 {
     int repeat_count = DEFAULT_REPEAT_COUNT;
     int num_thread = DEFAULT_THREAD_COUNT;
-    char *model_file = nullptr;
-    char *image_file = nullptr;
+    char* model_file = nullptr;
+    char* image_file = nullptr;
     int img_h = LETTERBOX_COLS;
     int img_w = LETTERBOX_ROWS;
     ai_body_parts_s pose;
@@ -258,40 +259,40 @@ int main(int argc, char *argv[])
     float scale[3] = {0.017125f, 0.017507f, 0.017429f};
 
     int res;
-    while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) 
+    while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1)
     {
-        switch (res) 
+        switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
     /* check files */
-    if (model_file == nullptr) 
-	{
+    if (model_file == nullptr)
+    {
         fprintf(stderr, "Error: Tengine model file not specified!\n");
         show_usage();
         return -1;
     }
 
-    if (image_file == nullptr) 
-	{
+    if (image_file == nullptr)
+    {
         fprintf(stderr, "Error: Image file not specified!\n");
         show_usage();
         return -1;
@@ -332,31 +333,31 @@ int main(int argc, char *argv[])
     }
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
+    int dims[] = {1, 3, img_h, img_w}; // nchw
     std::vector<uint8_t> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
-    if (input_tensor == nullptr) 
-	{
+    if (input_tensor == nullptr)
+    {
         fprintf(stderr, "Get input tensor failed\n");
         return -1;
     }
 
-    if (set_tensor_shape(input_tensor, dims, 4) < 0) 
-	{
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
         fprintf(stderr, "Set input tensor shape failed\n");
         return -1;
     }
 
     if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0)
-	{
+    {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
     }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
-    if (prerun_graph_multithread(graph, opt) < 0) 
-	{
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
         fprintf(stderr, "Prerun multithread graph failed.\n");
         return -1;
     }
@@ -371,11 +372,11 @@ int main(int argc, char *argv[])
     double min_time = DBL_MAX;
     double max_time = DBL_MIN;
     double total_time = 0.;
-    for (int i = 0; i < repeat_count; i++) 
-	{
+    for (int i = 0; i < repeat_count; i++)
+    {
         double start = get_current_time();
-        if (run_graph(graph, 1) < 0) 
-		{
+        if (run_graph(graph, 1) < 0)
+        {
             fprintf(stderr, "Run graph failed\n");
             return -1;
         }
@@ -386,20 +387,21 @@ int main(int argc, char *argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat [%d] min %.3f ms, max %.3f ms, avg %.3f ms\n", repeat_count, min_time, max_time,
-           total_time / repeat_count);
+            total_time / repeat_count);
 
     /* get output tensor */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor);
-    int output_size = get_tensor_buffer_size(output_tensor)/ sizeof(uint8_t);
+    uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(uint8_t);
     /* dequant */
     float output_scale = 0.f;
     int output_zero_point = 0;
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     // float* output_data = ( float* )malloc(output_size * sizeof(float));
     std::vector<float> output_data(output_size);
-    for (int i = 0; i < output_size; i++) {
-        output_data[i] = ((float) output_u8[i] - (float) output_zero_point) * output_scale;
+    for (int i = 0; i < output_size; i++)
+    {
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
     }
 
     PostProcess(output_data.data(), pose, img_h, img_w);
diff --git a/examples/tm_landmark.cpp b/examples/tm_landmark.cpp
index 74fc95c9a..081a17a43 100644
--- a/examples/tm_landmark.cpp
+++ b/examples/tm_landmark.cpp
@@ -36,7 +36,7 @@ void get_input_fp32_data(const char* image_file, float* input_data, int img_h, i
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
         input_data[i] = image_data[i];
@@ -65,23 +65,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -124,8 +124,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = (float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -182,13 +182,13 @@ int main(int argc, char* argv[])
     /* get output tensor */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
 
-    float* data = ( float* )(get_tensor_buffer(output_tensor));
-    int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float );
+    float* data = (float*)(get_tensor_buffer(output_tensor));
+    int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     image img_out = imread(image_file);
     for (int i = 0; i < data_size / 2; i++)
     {
-        int x = (int)(data[2 * i    ] * (float)img_out.w / 144.f);
+        int x = (int)(data[2 * i] * (float)img_out.w / 144.f);
         int y = (int)(data[2 * i + 1] * (float)img_out.h / 144.f);
         draw_circle(img_out, x, y, 2, 0, 255, 0);
     }
diff --git a/examples/tm_landmark_timvx.cpp b/examples/tm_landmark_timvx.cpp
index 3062f4d8e..08c3901f5 100644
--- a/examples/tm_landmark_timvx.cpp
+++ b/examples/tm_landmark_timvx.cpp
@@ -37,11 +37,11 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
     {
-        int udata = (round)(image_data[i] / input_scale + (float )zero_point);
+        int udata = (round)(image_data[i] / input_scale + (float)zero_point);
         if (udata > 255)
             udata = 255;
         else if (udata < 0)
@@ -74,23 +74,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -146,8 +146,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    uint8_t* input_data = ( uint8_t* )malloc(img_size);
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    uint8_t* input_data = (uint8_t*)malloc(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -210,13 +210,13 @@ int main(int argc, char* argv[])
     float output_scale = 0.f;
     int output_zp = 0;
     get_tensor_quant_param(output_tensor, &output_scale, &output_zp, 1);
-    uint8_t* data = ( uint8_t* )(get_tensor_buffer(output_tensor));
+    uint8_t* data = (uint8_t*)(get_tensor_buffer(output_tensor));
     int data_size = get_tensor_buffer_size(output_tensor) / sizeof(uint8_t);
 
     image img_out = imread(image_file);
     for (int i = 0; i < data_size / 2; i++)
     {
-        int x = (int)(((float)data[2 * i    ] - (float)output_zp) * output_scale * (float)img_out.w / 144.f);
+        int x = (int)(((float)data[2 * i] - (float)output_zp) * output_scale * (float)img_out.w / 144.f);
         int y = (int)(((float)data[2 * i + 1] - (float)output_zp) * output_scale * (float)img_out.h / 144.f);
         draw_circle(img_out, x, y, 2, 0, 255, 0);
     }
diff --git a/examples/tm_landmark_uint8.cpp b/examples/tm_landmark_uint8.cpp
index 4010cc034..af825d9e4 100644
--- a/examples/tm_landmark_uint8.cpp
+++ b/examples/tm_landmark_uint8.cpp
@@ -37,11 +37,11 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
     {
-        int udata = (round)(image_data[i] / input_scale + (float )zero_point);
+        int udata = (round)(image_data[i] / input_scale + (float)zero_point);
         if (udata > 255)
             udata = 255;
         else if (udata < 0)
@@ -74,23 +74,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -133,8 +133,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    uint8_t* input_data = ( uint8_t* )malloc(img_size);
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    uint8_t* input_data = (uint8_t*)malloc(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -197,13 +197,13 @@ int main(int argc, char* argv[])
     float output_scale = 0.f;
     int output_zp = 0;
     get_tensor_quant_param(output_tensor, &output_scale, &output_zp, 1);
-    uint8_t* data = ( uint8_t* )(get_tensor_buffer(output_tensor));
+    uint8_t* data = (uint8_t*)(get_tensor_buffer(output_tensor));
     int data_size = get_tensor_buffer_size(output_tensor) / sizeof(uint8_t);
 
     image img_out = imread(image_file);
     for (int i = 0; i < data_size / 2; i++)
     {
-        int x = (int)(((float)data[2 * i    ] - (float)output_zp) * output_scale * (float)img_out.w / 144.f);
+        int x = (int)(((float)data[2 * i] - (float)output_zp) * output_scale * (float)img_out.w / 144.f);
         int y = (int)(((float)data[2 * i + 1] - (float)output_zp) * output_scale * (float)img_out.h / 144.f);
         draw_circle(img_out, x, y, 2, 0, 255, 0);
     }
diff --git a/examples/tm_mobilefacenet.cpp b/examples/tm_mobilefacenet.cpp
index 5b6b8f841..d7e3020b6 100644
--- a/examples/tm_mobilefacenet.cpp
+++ b/examples/tm_mobilefacenet.cpp
@@ -35,7 +35,7 @@
 #define DEFAULT_MEAN3 122.679
 
 #define MOBILE_FACE_HEIGHT 110
-#define MOBILE_FACE_WIDTH 110
+#define MOBILE_FACE_WIDTH  110
 
 graph_t graph;
 tensor_t input_tensor;
@@ -81,7 +81,7 @@ int getFeature(const char* imagefile, float* feature)
         fprintf(stderr, "run_graph fail");
         return -1;
     }
-    float* data = ( float* )get_tensor_buffer(output_tensor);
+    float* data = (float*)get_tensor_buffer(output_tensor);
     int outsize;
     outsize = get_tensor_buffer_size(output_tensor) / sizeof(float);
     for (int i = 0; i < outsize; i++)
@@ -127,20 +127,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'a':
-                person_a = optarg;
-                break;
-            case 'b':
-                person_b = optarg;
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'a':
+            person_a = optarg;
+            break;
+        case 'b':
+            person_b = optarg;
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_mobilefacenet_uint8.cpp b/examples/tm_mobilefacenet_uint8.cpp
index e34e09098..f5756dd3f 100644
--- a/examples/tm_mobilefacenet_uint8.cpp
+++ b/examples/tm_mobilefacenet_uint8.cpp
@@ -30,15 +30,15 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_MEAN1 127.5
-#define DEFAULT_MEAN2 127.5
-#define DEFAULT_MEAN3 127.5
+#define DEFAULT_MEAN1  127.5
+#define DEFAULT_MEAN2  127.5
+#define DEFAULT_MEAN3  127.5
 #define DEFAULT_SCALE1 0.0078
 #define DEFAULT_SCALE2 0.0078
 #define DEFAULT_SCALE3 0.0078
 
 #define MOBILE_FACE_HEIGHT 112
-#define MOBILE_FACE_WIDTH 112
+#define MOBILE_FACE_WIDTH  112
 
 graph_t graph;
 tensor_t input_tensor;
@@ -52,7 +52,7 @@ void init(const char* modelfile)
     opt.num_thread = 1;
     opt.cluster = TENGINE_CLUSTER_ALL;
     opt.precision = TENGINE_MODE_UINT8;
-    opt.affinity = 0x01;    
+    opt.affinity = 0x01;
 
     int dims[4] = {1, 3, MOBILE_FACE_HEIGHT, MOBILE_FACE_WIDTH};
     init_tengine();
@@ -83,7 +83,7 @@ void get_input_uint8_data(const char* image_file, uint8_t* input_data, int img_h
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
     {
@@ -111,7 +111,7 @@ int getFeature(const char* imagefile, float* feature)
 
     float input_scale = 0.f;
     int input_zero_point = 0;
-    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);    
+    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
     get_input_uint8_data(imagefile, input_data.data(), height, width, means, scales, input_scale, input_zero_point);
 
     set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(uint8_t));
@@ -123,7 +123,7 @@ int getFeature(const char* imagefile, float* feature)
 
     /* get the result of classification */
     output_tensor = get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor);
+    uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor);
 
     /* dequant */
@@ -131,7 +131,7 @@ int getFeature(const char* imagefile, float* feature)
     int output_zero_point = 0;
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     for (int i = 0; i < output_size; i++)
-        feature[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        feature[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     return output_size;
 }
@@ -174,20 +174,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'a':
-                person_a = optarg;
-                break;
-            case 'b':
-                person_b = optarg;
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'a':
+            person_a = optarg;
+            break;
+        case 'b':
+            person_b = optarg;
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_mobilenet_ssd.c b/examples/tm_mobilenet_ssd.c
index 873cb1ddf..f49bc11af 100644
--- a/examples/tm_mobilenet_ssd.c
+++ b/examples/tm_mobilenet_ssd.c
@@ -29,8 +29,8 @@
 #include "tengine_operations.h"
 
 #define DEFAULT_MAX_BOX_COUNT 100
-#define DEFAULT_REPEAT_COUNT    1
-#define DEFAULT_THREAD_COUNT    1
+#define DEFAULT_REPEAT_COUNT  1
+#define DEFAULT_THREAD_COUNT  1
 
 typedef struct Box
 {
@@ -44,10 +44,10 @@ typedef struct Box
 
 void post_process_ssd(const char* image_file, float threshold, const float* outdata, int num)
 {
-    const char* class_names[] = {"background", "aeroplane", "bicycle",   "bird",   "boat",        "bottle",
-                                 "bus",        "car",       "cat",       "chair",  "cow",         "diningtable",
-                                 "dog",        "horse",     "motorbike", "person", "pottedplant", "sheep",
-                                 "sofa",       "train",     "tvmonitor"};
+    const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle",
+                                 "bus", "car", "cat", "chair", "cow", "diningtable",
+                                 "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
+                                 "sofa", "train", "tvmonitor"};
 
     image im = imread(image_file);
 
@@ -117,23 +117,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -176,8 +176,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -233,10 +233,10 @@ int main(int argc, char* argv[])
     fprintf(stderr, "--------------------------------------\n");
 
     /* process the detection result */
-    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);    //"detection_out"
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"
     int out_dim[4];
     get_tensor_shape(output_tensor, out_dim, 4);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     post_process_ssd(image_file, show_threshold, output_data, out_dim[1]);
 
     /* release tengine */
diff --git a/examples/tm_mobilenet_ssd_acl.c b/examples/tm_mobilenet_ssd_acl.c
index e8ade622c..35e8a5868 100644
--- a/examples/tm_mobilenet_ssd_acl.c
+++ b/examples/tm_mobilenet_ssd_acl.c
@@ -27,8 +27,8 @@
 #include "tengine_operations.h"
 
 #define DEFAULT_MAX_BOX_COUNT 100
-#define DEFAULT_REPEAT_COUNT    1
-#define DEFAULT_THREAD_COUNT    1
+#define DEFAULT_REPEAT_COUNT  1
+#define DEFAULT_THREAD_COUNT  1
 
 typedef struct Box
 {
@@ -42,10 +42,10 @@ typedef struct Box
 
 void post_process_ssd(const char* image_file, float threshold, const float* outdata, int num)
 {
-    const char* class_names[] = {"background", "aeroplane", "bicycle",   "bird",   "boat",        "bottle",
-                                 "bus",        "car",       "cat",       "chair",  "cow",         "diningtable",
-                                 "dog",        "horse",     "motorbike", "person", "pottedplant", "sheep",
-                                 "sofa",       "train",     "tvmonitor"};
+    const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle",
+                                 "bus", "car", "cat", "chair", "cow", "diningtable",
+                                 "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
+                                 "sofa", "train", "tvmonitor"};
 
     image im = imread(image_file);
 
@@ -115,23 +115,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -187,8 +187,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -207,7 +207,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -244,10 +244,10 @@ int main(int argc, char* argv[])
     fprintf(stderr, "--------------------------------------\n");
 
     /* process the detection result */
-    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);    //"detection_out"
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"
     int out_dim[4];
     get_tensor_shape(output_tensor, out_dim, 4);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     post_process_ssd(image_file, show_threshold, output_data, out_dim[1]);
 
     /* release tengine */
diff --git a/examples/tm_mobilenet_ssd_uint8.cpp b/examples/tm_mobilenet_ssd_uint8.cpp
index ced0319f7..6420b4a9a 100644
--- a/examples/tm_mobilenet_ssd_uint8.cpp
+++ b/examples/tm_mobilenet_ssd_uint8.cpp
@@ -49,7 +49,7 @@ void get_input_uint_data_ssd(const char* image_file, uint8_t* input_data, int im
     float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f};
     image img = imread_process(image_file, img_w, img_h, mean, scales);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
     {
@@ -67,10 +67,10 @@ void get_input_uint_data_ssd(const char* image_file, uint8_t* input_data, int im
 
 void post_process_ssd(const char* image_file, float threshold, float* outdata, int num)
 {
-    const char* class_names[] = {"background", "aeroplane", "bicycle",   "bird",   "boat",        "bottle",
-                                 "bus",        "car",       "cat",       "chair",  "cow",         "diningtable",
-                                 "dog",        "horse",     "motorbike", "person", "pottedplant", "sheep",
-                                 "sofa",       "train",     "tvmonitor"};
+    const char* class_names[] = {"background", "aeroplane", "bicycle", "bird", "boat", "bottle",
+                                 "bus", "car", "cat", "chair", "cow", "diningtable",
+                                 "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
+                                 "sofa", "train", "tvmonitor"};
 
     image im = imread(image_file);
 
@@ -91,11 +91,11 @@ void post_process_ssd(const char* image_file, float threshold, float* outdata, i
             box.y1 = outdata[5] * raw_h;
             boxes.push_back(box);
             printf("%s\t:%.2f\n", class_names[box.class_idx], box.score * 100.f);
-            printf("BOX:( %d , %d ),( %d , %d )\n", ( int )box.x0, ( int )box.y0, ( int )box.x1, ( int )box.y1);
+            printf("BOX:( %d , %d ),( %d , %d )\n", (int)box.x0, (int)box.y0, (int)box.x1, (int)box.y1);
         }
         outdata += 6;
     }
-    for (int i = 0; i < ( int )boxes.size(); i++)
+    for (int i = 0; i < (int)boxes.size(); i++)
     {
         Box box = boxes[i];
         draw_box(im, box.x0, box.y0, box.x1, box.y1, 2, 125, 0, 125);
@@ -131,23 +131,23 @@ int main(int argc, char* argv[])
     {
         switch (ret)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -193,8 +193,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    uint8_t* input_data = ( uint8_t* )malloc(img_size * sizeof(uint8_t));    
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    uint8_t* input_data = (uint8_t*)malloc(img_size * sizeof(uint8_t));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -213,7 +213,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -225,7 +225,7 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     float input_scale = 0.f;
     int input_zero_point = 0;
-    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);    
+    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
     get_input_uint_data_ssd(image_file, input_data, img_h, img_w, input_scale, input_zero_point);
 
     /* run graph */
@@ -253,19 +253,19 @@ int main(int argc, char* argv[])
     fprintf(stderr, "--------------------------------------\n");
 
     /* process the detection result */
-    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);    //"detection_out"
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"
     int out_dim[4];
     get_tensor_shape(output_tensor, out_dim, 4);
     int output_size = get_tensor_buffer_size(output_tensor);
-    uint8_t* output_u8 = ( uint8_t* )get_tensor_buffer(output_tensor);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(output_tensor);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
 
     /* dequant */
     float output_scale = 0.f;
     int output_zero_point = 0;
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);    
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - (float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     /* post_process_ssd */
     post_process_ssd(image_file, show_threshold, output_data, out_dim[1]);
diff --git a/examples/tm_nanodet_m.cpp b/examples/tm_nanodet_m.cpp
index 5c614e7d5..16aa0a3ba 100644
--- a/examples/tm_nanodet_m.cpp
+++ b/examples/tm_nanodet_m.cpp
@@ -42,13 +42,12 @@
 #include "tengine_operations.h"
 
 // tengine output tensor names
-const char *cls_pred_name[] = {
-    "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"
-};
-const char *dis_pred_name[] = {
+const char* cls_pred_name[] = {
+    "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"};
+const char* dis_pred_name[] = {
 #ifdef TRY_POST_SOFTMAX
     "dis_pred_stride_8", "dis_pred_stride_16", "dis_pred_stride_32"
-#else /* !TRY_POST_SOFTMAX */
+#else  /* !TRY_POST_SOFTMAX */
     "dis_sm_stride_8", "dis_sm_stride_16", "dis_sm_stride_32"
 #endif /* TRY_POST_SOFTMAX */
 };
@@ -60,8 +59,10 @@ struct Object
     float prob;
 };
 
-static __inline float fast_exp(float x) {
-    union {
+static __inline float fast_exp(float x)
+{
+    union
+    {
         uint32_t i;
         float f;
     } v{};
@@ -70,16 +71,19 @@ static __inline float fast_exp(float x) {
 }
 
 template<typename _Tp>
-static int softmax(const _Tp* src, _Tp* dst, int length) {
+static int softmax(const _Tp* src, _Tp* dst, int length)
+{
     const _Tp max_value = *std::max_element(src, src + length);
-    _Tp denominator{ 0 };
- 
-    for (int i = 0; i < length; ++i) {
-        dst[i] = std::exp/*fast_exp*/(src[i] - max_value);
+    _Tp denominator{0};
+
+    for (int i = 0; i < length; ++i)
+    {
+        dst[i] = std::exp /*fast_exp*/ (src[i] - max_value);
         denominator += dst[i];
     }
- 
-    for (int i = 0; i < length; ++i) {
+
+    for (int i = 0; i < length; ++i)
+    {
         dst[i] /= denominator;
     }
 
@@ -178,8 +182,9 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
 // @param:  in_pad[in]   as letter box's shape
 // @param:  prob_threshold[in]
 // @param:  objects[out] output detected objects
-static void generate_proposals(const float *cls_pred, const float *dis_pred, int stride,
-    const image &in_pad, float prob_threshold, std::vector<Object>& objects) {
+static void generate_proposals(const float* cls_pred, const float* dis_pred, int stride,
+                               const image& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
     const int num_grid_x = in_pad.w / stride;
     const int num_grid_y = in_pad.h / stride;
     // Note: Here, we hard coded some model parameters for simplicity.
@@ -188,37 +193,44 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int
     // Discrete distribution parameter, see the following resources for more details:
     // [nanodet-m.yml](https://github.com/RangiLyu/nanodet/blob/main/config/nanodet-m.yml)
     // [GFL](https://arxiv.org/pdf/2006.04388.pdf)
-    const int reg_max_1 = 8;  // 32 / 4;
+    const int reg_max_1 = 8; // 32 / 4;
 
-    for (int i = 0; i < num_grid_y; i++) {
-        for (int j = 0; j < num_grid_x; j++) {
+    for (int i = 0; i < num_grid_y; i++)
+    {
+        for (int j = 0; j < num_grid_x; j++)
+        {
             const int idx = i * num_grid_x + j;
 
-            const float *scores = cls_pred + idx * num_class;
+            const float* scores = cls_pred + idx * num_class;
 
             // find label with max score
             int label = -1;
             float score = -FLT_MAX;
-            for (int k = 0; k < num_class; k++) {
-                if (scores[k] > score) {
+            for (int k = 0; k < num_class; k++)
+            {
+                if (scores[k] > score)
+                {
                     label = k;
                     score = scores[k];
                 }
             }
 
-            if (score >= prob_threshold) {
+            if (score >= prob_threshold)
+            {
                 float pred_ltrb[4];
-                for (int k = 0; k < 4; k++) {
+                for (int k = 0; k < 4; k++)
+                {
                     float dis = 0.f;
                     // predicted distance distribution after softmax
 #ifdef TRY_POST_SOFTMAX
-                    float dis_after_sm[8] = { 0. };
+                    float dis_after_sm[8] = {0.};
                     softmax(dis_pred + idx * reg_max_1 * 4 + k * reg_max_1, dis_after_sm, 8);
-#else /* !TRY_POST_SOFTMAX */
-                    const float *dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1;
+#else  /* !TRY_POST_SOFTMAX */
+                    const float* dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1;
 #endif /* TRY_POST_SOFTMAX */
                     // integral on predicted discrete distribution
-                    for (int l = 0; l < reg_max_1; l++) {
+                    for (int l = 0; l < reg_max_1; l++)
+                    {
                         dis += l * dis_after_sm[l];
                         //printf("%2.6f ", dis_after_sm[l]);
                     }
@@ -250,19 +262,18 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int
     }
 }
 
-static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, const char *path)
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, const char* path)
 {
     static const char* class_names[] = {
-            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
-            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-            "hair drier", "toothbrush"
-    };
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -271,8 +282,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects,
         const Object& obj = objects[i];
 
         fprintf(stderr, "%2d: %3.3f%%, [%7.3f, %7.3f, %7.3f, %7.3f], %s\n",
-            obj.label, obj.prob * 100, obj.rect.x, obj.rect.y,
-            obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]);
+                obj.label, obj.prob * 100, obj.rect.x, obj.rect.y,
+                obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]);
 
         cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
 
@@ -311,10 +322,13 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects,
 /// @param norm norm values per channel
 static void nhwc_to_nchw(float* src, float* dst, int h_limit, int w_limit, int c_limit, const float* mean, const float* norm)
 {
-    for (int h = 0; h < h_limit; h++) {
-        for (int w = 0; w < w_limit; w++) {
-            for (int c = 0; c < 3; c++) {
-                int in_index  = h * w_limit * 3 + w * 3 + c;
+    for (int h = 0; h < h_limit; h++)
+    {
+        for (int w = 0; w < w_limit; w++)
+        {
+            for (int c = 0; c < 3; c++)
+            {
+                int in_index = h * w_limit * 3 + w * 3 + c;
                 int out_index = c * h_limit * w_limit + h * w_limit + w;
                 dst[out_index] = (src[in_index] - mean[c]) * norm[c];
             }
@@ -323,17 +337,19 @@ static void nhwc_to_nchw(float* src, float* dst, int h_limit, int w_limit, int c
 }
 
 // @brief:  get input data and resize to model input shape directly
-static int get_input_data(const char *path, const float *mean, const float *norm, image &lb) {
+static int get_input_data(const char* path, const float* mean, const float* norm, image& lb)
+{
     // load input image
     cv::Mat img = cv::imread(path, 1);
-    if (img.empty()) {
+    if (img.empty())
+    {
         fprintf(stderr, "cv::imread %s failed\n", path);
         return -1;
     }
 
     if (img.cols != lb.w || img.rows != lb.h) cv::resize(img, img, cv::Size(lb.w, lb.h));
     img.convertTo(img, CV_32FC3);
-    float *_data = (float *)img.data;
+    float* _data = (float*)img.data;
 
     nhwc_to_nchw(_data, lb.data, lb.h, lb.w, 3, mean, norm);
     return 0;
@@ -343,10 +359,12 @@ static int get_input_data(const char *path, const float *mean, const float *norm
 // @param:  lb[in/out]  letter box image inst
 // @param:  pad[out]    top and left pad size
 // @return: resize scale from origin image to letter box
-static float get_input_data(const char *path, const float *mean, const float *norm, image &lb, image &pad) {
+static float get_input_data(const char* path, const float* mean, const float* norm, image& lb, image& pad)
+{
     // load input image
     cv::Mat img = cv::imread(path, 1);
-    if (img.empty()) {
+    if (img.empty())
+    {
         fprintf(stderr, "cv::imread %s failed\n", path);
         return -1.;
     }
@@ -365,29 +383,31 @@ static float get_input_data(const char *path, const float *mean, const float *no
     pad.h = lb.h - h; //(h + 31) / 32 * 32 - h;
     // Generate a gray image using opencv
     cv::Mat img_pad(lb.w, lb.h, CV_32FC3, //cv::Scalar(0));
-        cv::Scalar(0.5/norm[0] + mean[0], 0.5/norm[1] + mean[1], 0.5/norm[2] + mean[2]));
+                    cv::Scalar(0.5 / norm[0] + mean[0], 0.5 / norm[1] + mean[1], 0.5 / norm[2] + mean[2]));
     // Letterbox filling
-    cv::copyMakeBorder(img, img_pad, pad.h/2, pad.h - pad.h/2, pad.w/2, pad.w - pad.w/2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
+    cv::copyMakeBorder(img, img_pad, pad.h / 2, pad.h - pad.h / 2, pad.w / 2, pad.w - pad.w / 2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
 
     img_pad.convertTo(img_pad, CV_32FC3);
-    float *_data = (float *)img_pad.data;
+    float* _data = (float*)img_pad.data;
     nhwc_to_nchw(_data, lb.data, lb.h, lb.w, 3, mean, norm);
 
     return lb_scale;
 }
 
-static void show_usage() {
+static void show_usage()
+{
     fprintf(stderr, "[Usage]: [-h]\n");
     fprintf(stderr, "   [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count] [-o output_file]\n");
 }
 
-int main(int argc, char* argv[]) {
+int main(int argc, char* argv[])
+{
     const char* model_file = nullptr;
     const char* image_file = nullptr;
     const char* output_file = "nanodet_m_out.jpg";
 
-    const float mean[3] = { 103.53f, 116.28f, 123.675f }; // bgr
-    const float norm[3] = { 0.017429f, 0.017507f, 0.017125f };
+    const float mean[3] = {103.53f, 116.28f, 123.675f}; // bgr
+    const float norm[3] = {0.017429f, 0.017507f, 0.017125f};
 
     int repeat_count = 1;
     int num_thread = 1;
@@ -396,38 +416,42 @@ int main(int argc, char* argv[]) {
     const float nms_threshold = 0.5f;
 
     int res;
-    while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) {
-        switch (res) {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'o':
-                output_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+    while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1)
+    {
+        switch (res)
+        {
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'o':
+            output_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
     /* check files */
-    if (nullptr == model_file || nullptr == image_file) {
+    if (nullptr == model_file || nullptr == image_file)
+    {
         fprintf(stderr, "Error: Tengine model file not specified!\n");
         show_usage();
         return -1;
     }
-    if (!check_file_exist(model_file) || !check_file_exist(image_file)) {
+    if (!check_file_exist(model_file) || !check_file_exist(image_file))
+    {
         return -1;
     }
 
@@ -439,7 +463,8 @@ int main(int argc, char* argv[]) {
     opt.affinity = 0;
 
     /* inital tengine */
-    if (0 != init_tengine()) {
+    if (0 != init_tengine())
+    {
         fprintf(stderr, "Initial tengine failed.\n");
         return -1;
     }
@@ -447,14 +472,16 @@ int main(int argc, char* argv[]) {
 
     /* create graph, load tengine model xxx.tmfile */
     graph_t graph = create_graph(nullptr, "tengine", model_file);
-    if (nullptr == graph) {
+    if (nullptr == graph)
+    {
         fprintf(stderr, "Create graph failed.\n");
         return -1;
     }
 
     /* get input tensor of graph */
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
-    if (nullptr == input_tensor) {
+    if (nullptr == input_tensor)
+    {
         fprintf(stderr, "Get input tensor failed\n");
         return -1;
     }
@@ -462,7 +489,8 @@ int main(int argc, char* argv[]) {
     /* get shape of input tensor */
     int i, dims[4]; // nchw
     int dim_num = get_tensor_shape(input_tensor, dims, 4);
-    if (4 != dim_num) {
+    if (4 != dim_num)
+    {
         fprintf(stderr, "Get input tensor shape error\n");
         return -1;
     }
@@ -473,18 +501,20 @@ int main(int argc, char* argv[]) {
 #ifdef TRY_LETTER_BOX
     image pad = make_empty_image(lb.w, lb.h, lb.c);
     float lb_scale = get_input_data(image_file, mean, norm, lb, pad);
-#else /* !TRY_LETTER_BOX */
+#else  /* !TRY_LETTER_BOX */
     get_input_data(image_file, mean, norm, lb);
 #endif /* TRY_LETTER_BOX */
 
     /* set the data mem to input tensor */
-    if (set_tensor_buffer(input_tensor, lb.data, img_size * sizeof(float)) < 0) {
+    if (set_tensor_buffer(input_tensor, lb.data, img_size * sizeof(float)) < 0)
+    {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
     }
 
     /* prerun graph to infer shape, and set work options(num_thread, cluster, precision) */
-    if (prerun_graph_multithread(graph, opt) < 0) {
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
         fprintf(stderr, "Prerun multithread graph failed.\n");
         return -1;
     }
@@ -493,9 +523,11 @@ int main(int argc, char* argv[]) {
     double min_time = DBL_MAX;
     double max_time = DBL_MIN;
     double total_time = 0.;
-    for (i = 0; i < repeat_count; i++) {
+    for (i = 0; i < repeat_count; i++)
+    {
         double start = get_current_time();
-        if (run_graph(graph, 1) < 0) {
+        if (run_graph(graph, 1) < 0)
+        {
             fprintf(stderr, "Run graph failed\n");
             return -1;
         }
@@ -506,22 +538,24 @@ int main(int argc, char* argv[]) {
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* nanodet_m postprocess */
     std::vector<Object> proposals, objects;
-    for (int stride_index = 0; stride_index < 3; stride_index++) {
+    for (int stride_index = 0; stride_index < 3; stride_index++)
+    {
         tensor_t cls_tensor = get_graph_tensor(graph, cls_pred_name[stride_index]);
         tensor_t dis_tensor = get_graph_tensor(graph, dis_pred_name[stride_index]);
-        if (NULL == cls_tensor || NULL ==dis_tensor) {
+        if (NULL == cls_tensor || NULL == dis_tensor)
+        {
             fprintf(stderr, "get graph tensor failed\n");
             return -1;
         }
-        const float *cls_pred = (const float *)get_tensor_buffer(cls_tensor);
-        const float *dis_pred = (const float *)get_tensor_buffer(dis_tensor);
+        const float* cls_pred = (const float*)get_tensor_buffer(cls_tensor);
+        const float* dis_pred = (const float*)get_tensor_buffer(dis_tensor);
         generate_proposals(cls_pred, dis_pred, 1 << (stride_index + 3),
-            lb, prob_threshold, objects);
+                           lb, prob_threshold, objects);
         proposals.insert(proposals.end(), objects.begin(), objects.end());
     }
 
@@ -534,10 +568,11 @@ int main(int argc, char* argv[]) {
 
     cv::Mat img = cv::imread(image_file);
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < count; i++)
+    {
         objects[i] = proposals[picked[i]];
 
 #ifdef TRY_LETTER_BOX
@@ -546,7 +581,7 @@ int main(int argc, char* argv[]) {
         float y0 = (objects[i].rect.y - (pad.h / 2)) / lb_scale;
         float x1 = (objects[i].rect.x + objects[i].rect.width - (pad.w / 2)) / lb_scale;
         float y1 = (objects[i].rect.y + objects[i].rect.height - (pad.h / 2)) / lb_scale;
-#else /* !TRY_LETTER_BOX */
+#else  /* !TRY_LETTER_BOX */
         // adjust offset to original unresized
         static float lb_scale_w = 1. * lb.w / img.cols;
         static float lb_scale_h = 1. * lb.h / img.rows;
@@ -576,4 +611,3 @@ int main(int argc, char* argv[]) {
     release_tengine();
     return 0;
 }
-
diff --git a/examples/tm_nanodet_m_timvx.cpp b/examples/tm_nanodet_m_timvx.cpp
index ed9471d75..dc1edeb81 100644
--- a/examples/tm_nanodet_m_timvx.cpp
+++ b/examples/tm_nanodet_m_timvx.cpp
@@ -42,13 +42,12 @@
 #include "tengine_operations.h"
 
 // tengine output tensor names
-const char *cls_pred_name[] = {
-    "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"
-};
-const char *dis_pred_name[] = {
+const char* cls_pred_name[] = {
+    "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"};
+const char* dis_pred_name[] = {
 #ifdef TRY_POST_SOFTMAX
     "dis_pred_stride_8", "dis_pred_stride_16", "dis_pred_stride_32"
-#else /* !TRY_POST_SOFTMAX */
+#else  /* !TRY_POST_SOFTMAX */
     "dis_sm_stride_8", "dis_sm_stride_16", "dis_sm_stride_32"
 #endif /* TRY_POST_SOFTMAX */
 };
@@ -60,8 +59,10 @@ struct Object
     float prob;
 };
 
-static __inline float fast_exp(float x) {
-    union {
+static __inline float fast_exp(float x)
+{
+    union
+    {
         uint32_t i;
         float f;
     } v{};
@@ -70,16 +71,19 @@ static __inline float fast_exp(float x) {
 }
 
 template<typename _Tp>
-static int softmax(const _Tp* src, _Tp* dst, int length) {
+static int softmax(const _Tp* src, _Tp* dst, int length)
+{
     const _Tp max_value = *std::max_element(src, src + length);
-    _Tp denominator{ 0 };
- 
-    for (int i = 0; i < length; ++i) {
-        dst[i] = std::exp/*fast_exp*/(src[i] - max_value);
+    _Tp denominator{0};
+
+    for (int i = 0; i < length; ++i)
+    {
+        dst[i] = std::exp /*fast_exp*/ (src[i] - max_value);
         denominator += dst[i];
     }
- 
-    for (int i = 0; i < length; ++i) {
+
+    for (int i = 0; i < length; ++i)
+    {
         dst[i] /= denominator;
     }
 
@@ -178,8 +182,9 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
 // @param:  in_pad[in]   as letter box's shape
 // @param:  prob_threshold[in]
 // @param:  objects[out] output detected objects
-static void generate_proposals(const float *cls_pred, const float *dis_pred, int stride,
-    const image &in_pad, float prob_threshold, std::vector<Object>& objects) {
+static void generate_proposals(const float* cls_pred, const float* dis_pred, int stride,
+                               const image& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
     const int num_grid_x = in_pad.w / stride;
     const int num_grid_y = in_pad.h / stride;
     // Note: Here, we hard coded some model parameters for simplicity.
@@ -188,37 +193,44 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int
     // Discrete distribution parameter, see the following resources for more details:
     // [nanodet-m.yml](https://github.com/RangiLyu/nanodet/blob/main/config/nanodet-m.yml)
     // [GFL](https://arxiv.org/pdf/2006.04388.pdf)
-    const int reg_max_1 = 8;  // 32 / 4;
+    const int reg_max_1 = 8; // 32 / 4;
 
-    for (int i = 0; i < num_grid_y; i++) {
-        for (int j = 0; j < num_grid_x; j++) {
+    for (int i = 0; i < num_grid_y; i++)
+    {
+        for (int j = 0; j < num_grid_x; j++)
+        {
             const int idx = i * num_grid_x + j;
 
-            const float *scores = cls_pred + idx * num_class;
+            const float* scores = cls_pred + idx * num_class;
 
             // find label with max score
             int label = -1;
             float score = -FLT_MAX;
-            for (int k = 0; k < num_class; k++) {
-                if (scores[k] > score) {
+            for (int k = 0; k < num_class; k++)
+            {
+                if (scores[k] > score)
+                {
                     label = k;
                     score = scores[k];
                 }
             }
 
-            if (score >= prob_threshold) {
+            if (score >= prob_threshold)
+            {
                 float pred_ltrb[4];
-                for (int k = 0; k < 4; k++) {
+                for (int k = 0; k < 4; k++)
+                {
                     float dis = 0.f;
                     // predicted distance distribution after softmax
 #ifdef TRY_POST_SOFTMAX
-                    float dis_after_sm[8] = { 0. };
+                    float dis_after_sm[8] = {0.};
                     softmax(dis_pred + idx * reg_max_1 * 4 + k * reg_max_1, dis_after_sm, 8);
-#else /* !TRY_POST_SOFTMAX */
-                    const float *dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1;
+#else  /* !TRY_POST_SOFTMAX */
+                    const float* dis_after_sm = dis_pred + idx * reg_max_1 * 4 + k * reg_max_1;
 #endif /* TRY_POST_SOFTMAX */
                     // integral on predicted discrete distribution
-                    for (int l = 0; l < reg_max_1; l++) {
+                    for (int l = 0; l < reg_max_1; l++)
+                    {
                         dis += l * dis_after_sm[l];
                         //printf("%2.6f ", dis_after_sm[l]);
                     }
@@ -250,19 +262,18 @@ static void generate_proposals(const float *cls_pred, const float *dis_pred, int
     }
 }
 
-static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, const char *path)
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, const char* path)
 {
     static const char* class_names[] = {
-            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
-            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-            "hair drier", "toothbrush"
-    };
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -271,8 +282,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects,
         const Object& obj = objects[i];
 
         fprintf(stderr, "%2d: %3.3f%%, [%7.3f, %7.3f, %7.3f, %7.3f], %s\n",
-            obj.label, obj.prob * 100, obj.rect.x, obj.rect.y,
-            obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]);
+                obj.label, obj.prob * 100, obj.rect.x, obj.rect.y,
+                obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]);
 
         cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
 
@@ -302,10 +313,12 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects,
 }
 
 // @brief:  get input data and resize to model input shape directly
-static int get_input_data(const char *path, const float *mean, const float *norm, image &lb) {
+static int get_input_data(const char* path, const float* mean, const float* norm, image& lb)
+{
     // load input image
     cv::Mat img = cv::imread(path, 1);
-    if (img.empty()) {
+    if (img.empty())
+    {
         fprintf(stderr, "cv::imread %s failed\n", path);
         return -1;
     }
@@ -314,11 +327,14 @@ static int get_input_data(const char *path, const float *mean, const float *norm
     img.convertTo(img, CV_32FC3);
 
     /* nhwc to nchw */
-    float *_data = (float *)img.data;
-    for (int h = 0; h < lb.h; h++) {
-        for (int w = 0; w < lb.w; w++) {
-            for (int c = 0; c < 3; c++) {
-                int in_index  = h * lb.w * 3 + w * 3 + c;
+    float* _data = (float*)img.data;
+    for (int h = 0; h < lb.h; h++)
+    {
+        for (int w = 0; w < lb.w; w++)
+        {
+            for (int c = 0; c < 3; c++)
+            {
+                int in_index = h * lb.w * 3 + w * 3 + c;
                 int out_index = c * lb.h * lb.w + h * lb.w + w;
                 lb.data[out_index] = (_data[in_index] - mean[c]) * norm[c];
             }
@@ -331,10 +347,12 @@ static int get_input_data(const char *path, const float *mean, const float *norm
 // @param:  lb[in/out]  letter box image inst
 // @param:  pad[out]    top and left pad size
 // @return: resize scale from origin image to letter box
-static float get_input_data(const char *path, const float *mean, const float *norm, image &lb, image &pad) {
+static float get_input_data(const char* path, const float* mean, const float* norm, image& lb, image& pad)
+{
     // load input image
     cv::Mat img = cv::imread(path, 1);
-    if (img.empty()) {
+    if (img.empty())
+    {
         fprintf(stderr, "cv::imread %s failed\n", path);
         return -1.;
     }
@@ -353,17 +371,20 @@ static float get_input_data(const char *path, const float *mean, const float *no
     pad.h = lb.h - h; //(h + 31) / 32 * 32 - h;
     // Generate a gray image using opencv
     cv::Mat img_pad(lb.w, lb.h, CV_32FC3, //cv::Scalar(0));
-        cv::Scalar(0.5/norm[0] + mean[0], 0.5/norm[0] + mean[0], 0.5/norm[2] + mean[2]));
+                    cv::Scalar(0.5 / norm[0] + mean[0], 0.5 / norm[0] + mean[0], 0.5 / norm[2] + mean[2]));
     // Letterbox filling
-    cv::copyMakeBorder(img, img_pad, pad.h/2, pad.h - pad.h/2, pad.w/2, pad.w - pad.w/2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
+    cv::copyMakeBorder(img, img_pad, pad.h / 2, pad.h - pad.h / 2, pad.w / 2, pad.w - pad.w / 2, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
 
     img_pad.convertTo(img_pad, CV_32FC3);
-    float *_data = (float *)img_pad.data;
+    float* _data = (float*)img_pad.data;
     /* nhwc to nchw */
-    for (int h = 0; h < lb.h; h++) {
-        for (int w = 0; w < lb.w; w++) {
-            for (int c = 0; c < 3; c++) {
-                int in_index  = h * lb.w * 3 + w * 3 + c;
+    for (int h = 0; h < lb.h; h++)
+    {
+        for (int w = 0; w < lb.w; w++)
+        {
+            for (int c = 0; c < 3; c++)
+            {
+                int in_index = h * lb.w * 3 + w * 3 + c;
                 int out_index = c * lb.h * lb.w + h * lb.w + w;
                 lb.data[out_index] = (_data[in_index] - mean[c]) * norm[c];
             }
@@ -373,7 +394,8 @@ static float get_input_data(const char *path, const float *mean, const float *no
     return lb_scale;
 }
 
-static void show_usage() {
+static void show_usage()
+{
     fprintf(stderr, "[Usage]: [-h]\n");
     fprintf(stderr, "   [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count] [-o output_file]\n");
 }
@@ -392,13 +414,14 @@ void get_input_uint8_data(float* input_fp32, uint8_t* input_data, int size, floa
     }
 }
 
-int main(int argc, char* argv[]) {
+int main(int argc, char* argv[])
+{
     const char* model_file = nullptr;
     const char* image_file = nullptr;
     const char* output_file = "nanodet_m_uint8_out.jpg";
 
-    const float mean[3] = { 103.53f, 116.28f, 123.675f }; // bgr
-    const float norm[3] = { 0.017429f, 0.017507f, 0.017125f };
+    const float mean[3] = {103.53f, 116.28f, 123.675f}; // bgr
+    const float norm[3] = {0.017429f, 0.017507f, 0.017125f};
 
     int repeat_count = 1;
     int num_thread = 1;
@@ -407,38 +430,42 @@ int main(int argc, char* argv[]) {
     const float nms_threshold = 0.5f;
 
     int res;
-    while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) {
-        switch (res) {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'o':
-                output_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+    while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1)
+    {
+        switch (res)
+        {
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'o':
+            output_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
     /* check files */
-    if (nullptr == model_file || nullptr == image_file) {
+    if (nullptr == model_file || nullptr == image_file)
+    {
         fprintf(stderr, "Error: Tengine model file not specified!\n");
         show_usage();
         return -1;
     }
-    if (!check_file_exist(model_file) || !check_file_exist(image_file)) {
+    if (!check_file_exist(model_file) || !check_file_exist(image_file))
+    {
         return -1;
     }
 
@@ -450,7 +477,8 @@ int main(int argc, char* argv[]) {
     opt.affinity = 0;
 
     /* inital tengine */
-    if (0 != init_tengine()) {
+    if (0 != init_tengine())
+    {
         fprintf(stderr, "Initial tengine failed.\n");
         return -1;
     }
@@ -466,14 +494,16 @@ int main(int argc, char* argv[]) {
     }
     /* create graph, load tengine model xxx.tmfile */
     graph_t graph = create_graph(timvx_context, "tengine", model_file);
-    if (nullptr == graph) {
+    if (nullptr == graph)
+    {
         fprintf(stderr, "Create graph failed.\n");
         return -1;
     }
 
     /* get input tensor of graph */
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
-    if (nullptr == input_tensor) {
+    if (nullptr == input_tensor)
+    {
         fprintf(stderr, "Get input tensor failed\n");
         return -1;
     }
@@ -481,7 +511,8 @@ int main(int argc, char* argv[]) {
     /* get shape of input tensor */
     int i, dims[4]; // nchw
     int dim_num = get_tensor_shape(input_tensor, dims, 4);
-    if (4 != dim_num) {
+    if (4 != dim_num)
+    {
         fprintf(stderr, "Get input tensor shape error\n");
         return -1;
     }
@@ -492,7 +523,7 @@ int main(int argc, char* argv[]) {
 #ifdef TRY_LETTER_BOX
     image pad = make_empty_image(lb.w, lb.h, lb.c);
     float lb_scale = get_input_data(image_file, mean, norm, lb, pad);
-#else /* !TRY_LETTER_BOX */
+#else  /* !TRY_LETTER_BOX */
     get_input_data(image_file, mean, norm, lb);
 #endif /* TRY_LETTER_BOX */
 
@@ -504,13 +535,15 @@ int main(int argc, char* argv[]) {
     get_input_uint8_data(lb.data, input_data.data(), img_size, input_scale, input_zero_point);
 
     /* set the data mem to input tensor */
-    if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0) {
+    if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0)
+    {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
     }
 
     /* prerun graph to infer shape, and set work options(num_thread, cluster, precision) */
-    if (prerun_graph_multithread(graph, opt) < 0) {
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
         fprintf(stderr, "Prerun multithread graph failed.\n");
         return -1;
     }
@@ -519,9 +552,11 @@ int main(int argc, char* argv[]) {
     double min_time = DBL_MAX;
     double max_time = DBL_MIN;
     double total_time = 0.;
-    for (int i = 0; i < repeat_count; i++) {
+    for (int i = 0; i < repeat_count; i++)
+    {
         double start = get_current_time();
-        if (run_graph(graph, 1) < 0) {
+        if (run_graph(graph, 1) < 0)
+        {
             fprintf(stderr, "Run graph failed\n");
             return -1;
         }
@@ -532,39 +567,39 @@ int main(int argc, char* argv[]) {
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* nanodet_m postprocess */
     std::vector<Object> proposals, objects;
-    for (int stride_index = 0; stride_index < 3; stride_index++) 
+    for (int stride_index = 0; stride_index < 3; stride_index++)
     {
         tensor_t cls_tensor = get_graph_tensor(graph, cls_pred_name[stride_index]);
         tensor_t dis_tensor = get_graph_tensor(graph, dis_pred_name[stride_index]);
 
-        int cls_count  = get_tensor_buffer_size(cls_tensor) / sizeof(uint8_t);  
-        int dis_count  = get_tensor_buffer_size(dis_tensor) / sizeof(uint8_t);  
+        int cls_count = get_tensor_buffer_size(cls_tensor) / sizeof(uint8_t);
+        int dis_count = get_tensor_buffer_size(dis_tensor) / sizeof(uint8_t);
 
-        float cls_scale  = 0.f;
-        float dis_scale  = 0.f;
-        int cls_zero_point  = 0;
-        int dis_zero_point  = 0;
+        float cls_scale = 0.f;
+        float dis_scale = 0.f;
+        int cls_zero_point = 0;
+        int dis_zero_point = 0;
 
         get_tensor_quant_param(cls_tensor, &cls_scale, &cls_zero_point, 1);
         get_tensor_quant_param(dis_tensor, &dis_scale, &dis_zero_point, 1);
-        
-        const uint8_t *cls_pred_u8 = (const uint8_t *)get_tensor_buffer(cls_tensor);
-        const uint8_t *dis_pred_u8 = (const uint8_t *)get_tensor_buffer(dis_tensor);
+
+        const uint8_t* cls_pred_u8 = (const uint8_t*)get_tensor_buffer(cls_tensor);
+        const uint8_t* dis_pred_u8 = (const uint8_t*)get_tensor_buffer(dis_tensor);
 
         std::vector<float> cls_pred(cls_count);
         std::vector<float> dis_pred(dis_count);
 
         for (int c = 0; c < cls_count; c++)
-            cls_pred[c] = (( float )cls_pred_u8[c] - ( float )cls_zero_point) * cls_scale;
+            cls_pred[c] = ((float)cls_pred_u8[c] - (float)cls_zero_point) * cls_scale;
 
         for (int c = 0; c < dis_count; c++)
-            dis_pred[c] = (( float )dis_pred_u8[c] - ( float )dis_zero_point) * dis_scale;      
-        
+            dis_pred[c] = ((float)dis_pred_u8[c] - (float)dis_zero_point) * dis_scale;
+
         generate_proposals(cls_pred.data(), dis_pred.data(), 1 << (stride_index + 3), lb, prob_threshold, objects);
         proposals.insert(proposals.end(), objects.begin(), objects.end());
     }
@@ -578,10 +613,11 @@ int main(int argc, char* argv[]) {
 
     cv::Mat img = cv::imread(image_file);
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
-    for (int i = 0; i < count; i++) {
+    for (int i = 0; i < count; i++)
+    {
         objects[i] = proposals[picked[i]];
 
 #ifdef TRY_LETTER_BOX
@@ -590,7 +626,7 @@ int main(int argc, char* argv[]) {
         float y0 = (objects[i].rect.y - (pad.h / 2)) / lb_scale;
         float x1 = (objects[i].rect.x + objects[i].rect.width - (pad.w / 2)) / lb_scale;
         float y1 = (objects[i].rect.y + objects[i].rect.height - (pad.h / 2)) / lb_scale;
-#else /* !TRY_LETTER_BOX */
+#else  /* !TRY_LETTER_BOX */
         // adjust offset to original unresized
         static float lb_scale_w = 1. * lb.w / img.cols;
         static float lb_scale_h = 1. * lb.h / img.rows;
@@ -620,4 +656,3 @@ int main(int argc, char* argv[]) {
     release_tengine();
     return 0;
 }
-
diff --git a/examples/tm_openpose.cpp b/examples/tm_openpose.cpp
index a8a9b88d8..04eb4acf5 100644
--- a/examples/tm_openpose.cpp
+++ b/examples/tm_openpose.cpp
@@ -17,23 +17,19 @@
 #define DEFAULT_THREAD_COUNT 1
 
 #ifdef MPI
-const int POSE_PAIRS[14][2] = {{0, 1},  {1, 2},  {2, 3}, {3, 4},  {1, 5},   {5, 6},   {6, 7},
-                               {1, 14}, {14, 8}, {8, 9}, {9, 10}, {14, 11}, {11, 12}, {12, 13}};
+const int POSE_PAIRS[14][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 4}, {1, 5}, {5, 6}, {6, 7}, {1, 14}, {14, 8}, {8, 9}, {9, 10}, {14, 11}, {11, 12}, {12, 13}};
 // std::string model_file = "models/openpose_mpi.tmfile";
 int nPoints = 15;
 #endif
 
 #ifdef COCO
-const int POSE_PAIRS[17][2] = {{1, 2},  {1, 5},   {2, 3},   {3, 4}, {5, 6},  {6, 7},   {1, 8},  {8, 9},  {9, 10},
-                               {1, 11}, {11, 12}, {12, 13}, {1, 0}, {0, 14}, {14, 16}, {0, 15}, {15, 17}};
+const int POSE_PAIRS[17][2] = {{1, 2}, {1, 5}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {1, 8}, {8, 9}, {9, 10}, {1, 11}, {11, 12}, {12, 13}, {1, 0}, {0, 14}, {14, 16}, {0, 15}, {15, 17}};
 // std::string model_file = "models/openpose_coco.tmfile";
 int nPoints = 18;
 #endif
 
 #ifdef BODY25
-const int POSE_PAIRS[24][2] = {{1, 2},   {1, 5},   {2, 3},   {3, 4},   {5, 6},   {6, 7},   {1, 8},   {8, 9},
-                               {9, 10},  {10, 11}, {11, 24}, {11, 22}, {22, 23}, {8, 12},  {12, 13}, {13, 14},
-                               {14, 21}, {14, 19}, {19, 20}, {1, 0},   {0, 15},  {16, 18}, {0, 16},  {15, 17}};
+const int POSE_PAIRS[24][2] = {{1, 2}, {1, 5}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {1, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 24}, {11, 22}, {22, 23}, {8, 12}, {12, 13}, {13, 14}, {14, 21}, {14, 19}, {19, 20}, {1, 0}, {0, 15}, {16, 18}, {0, 16}, {15, 17}};
 // std::string model_file = "models/openpose_body25.tmfile"
 int nPoints = 25;
 #endif
@@ -43,7 +39,7 @@ void get_input_data_pose(cv::Mat img, float* input_data, int img_h, int img_w)
     cv::resize(img, img, cv::Size(img_h, img_w));
     img.convertTo(img, CV_32FC3);
 
-    float* img_data = ( float* )img.data;
+    float* img_data = (float*)img.data;
     int hw = img_h * img_w;
     double scalefactor = 1.0 / 255;
     float mean[3] = {0, 0, 0};
@@ -78,19 +74,19 @@ void post_process_pose(cv::Mat img, cv::Mat frameCopy, float threshold, float* o
             if (outdata[piexl] > prob)
             {
                 prob = outdata[piexl];
-                maxloc.y = ( int )piexl / H;
-                maxloc.x = ( int )piexl % W;
+                maxloc.y = (int)piexl / H;
+                maxloc.x = (int)piexl % W;
             }
         }
         cv::Point2f p(-1, -1);
         if (prob > threshold)
         {
             p = maxloc;
-            p.y *= ( float )frameWidth / W;
-            p.x *= ( float )frameHeight / H;
+            p.y *= (float)frameWidth / W;
+            p.x *= (float)frameHeight / H;
 
-            cv::circle(frameCopy, cv::Point(( int )p.x, ( int )p.y), 4, cv::Scalar(255, 255, 0), -1);
-            cv::putText(frameCopy, cv::format("%d", n), cv::Point(( int )p.x, ( int )p.y), cv::FONT_HERSHEY_PLAIN, 2,
+            cv::circle(frameCopy, cv::Point((int)p.x, (int)p.y), 4, cv::Scalar(255, 255, 0), -1);
+            cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)p.x, (int)p.y), cv::FONT_HERSHEY_PLAIN, 2,
                         cv::Scalar(0, 255, 255), 2);
         }
         points[n] = p;
@@ -133,23 +129,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -193,9 +189,9 @@ int main(int argc, char* argv[])
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int channel = 3;
     int img_size = img_h * img_w * channel;
-    int dims[] = {1, channel, img_h, img_w};    // nchw
+    int dims[] = {1, channel, img_h, img_w}; // nchw
 
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -214,7 +210,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -258,7 +254,7 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-    float* outdata = ( float* )get_tensor_buffer(out_tensor);
+    float* outdata = (float*)get_tensor_buffer(out_tensor);
     int num = nPoints;
     int H = out_dim[2];
     int W = out_dim[3];
@@ -278,4 +274,3 @@ int main(int argc, char* argv[])
 
     return 0;
 }
-
diff --git a/examples/tm_retinaface.cpp b/examples/tm_retinaface.cpp
index 1d2346b79..0a4eccef8 100644
--- a/examples/tm_retinaface.cpp
+++ b/examples/tm_retinaface.cpp
@@ -116,7 +116,7 @@ void draw_target(const std::vector<Face2f>& all_pred_boxes, image img)
     const char* class_names[] = {"faces"};
 
     fprintf(stdout, "detected face num: %zu\n", all_pred_boxes.size());
-    for (int b = 0; b < ( int )all_pred_boxes.size(); b++)
+    for (int b = 0; b < (int)all_pred_boxes.size(); b++)
     {
         Face2f box = all_pred_boxes[b];
 
@@ -167,7 +167,7 @@ void nms_sorted_boxes(const std::vector<Face2f>& face_objects, std::vector<int>&
         const Face2f& a = face_objects[i];
 
         int keep = 1;
-        for (int j = 0; j < ( int )picked.size(); j++)
+        for (int j = 0; j < (int)picked.size(); j++)
         {
             const Face2f& b = face_objects[picked[j]];
 
@@ -228,22 +228,22 @@ std::vector<Box2f> generate_anchors(int base_size, const std::vector<float>& rat
 
     std::vector<Box2f> anchors(num_ratio * num_scale);
 
-    const float cx = ( float )base_size * 0.5f;
-    const float cy = ( float )base_size * 0.5f;
+    const float cx = (float)base_size * 0.5f;
+    const float cy = (float)base_size * 0.5f;
 
     for (int i = 0; i < num_ratio; i++)
     {
         float ar = ratios[i];
 
-        int r_w = ( int )round(( float )base_size / sqrt(ar));
-        int r_h = ( int )round(( float )r_w * ar);    // round(base_size * sqrt(ar));
+        int r_w = (int)round((float)base_size / sqrt(ar));
+        int r_h = (int)round((float)r_w * ar); // round(base_size * sqrt(ar));
 
         for (int j = 0; j < num_scale; j++)
         {
             float scale = scales[j];
 
-            float rs_w = ( float )r_w * scale;
-            float rs_h = ( float )r_h * scale;
+            float rs_w = (float)r_w * scale;
+            float rs_h = (float)r_h * scale;
 
             Box2f& anchor = anchors[i * num_scale + j];
 
@@ -337,10 +337,10 @@ static void generate_proposals(std::vector<Box2f>& anchors, int feat_stride, con
                     faces.push_back(obj);
                 }
 
-                anchor_x += ( float )feat_stride;
+                anchor_x += (float)feat_stride;
             }
 
-            anchor_y += ( float )feat_stride;
+            anchor_y += (float)feat_stride;
         }
     }
 }
@@ -360,11 +360,11 @@ int get_input_data(const char* image_file, const int& max_size, const int& targe
 
     scale = float(target_size) / float(im_size_min);
 
-    if (scale * ( float )im_size_max > ( float )max_size)
+    if (scale * (float)im_size_max > (float)max_size)
         scale = float(max_size) / float(im_size_max);
 
-    dst_size.width = ( int )round(( float )img.w * scale);
-    dst_size.height = ( int )round(( float )img.h * scale);
+    dst_size.width = (int)round((float)img.w * scale);
+    dst_size.height = (int)round((float)img.h * scale);
 
     image resImg = resize_image(img, dst_size.width, dst_size.height);
     int img_size = dst_size.height * dst_size.width * 3;
@@ -418,26 +418,26 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'n':
-                device_name = optarg;
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'n':
+            device_name = optarg;
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -464,7 +464,7 @@ int main(int argc, char* argv[])
     opt.num_thread = num_thread;
     opt.cluster = TENGINE_CLUSTER_ALL;
     opt.precision = TENGINE_MODE_FP32;
-    opt.affinity = 0;       
+    opt.affinity = 0;
 
     /* inital tengine */
     int ret = init_tengine();
@@ -518,7 +518,7 @@ int main(int argc, char* argv[])
     {
         printf("Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (0 != prerun_graph_multithread(graph, opt))
@@ -547,7 +547,7 @@ int main(int argc, char* argv[])
     }
     printf("img_h, img_w : %d, %d\n", image_size.height, image_size.width);
     printf("Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count,
-           num_thread, total_time / ( float )repeat_count, max_time, min_time);
+           num_thread, total_time / (float)repeat_count, max_time, min_time);
     printf("--------------------------------------\n");
 
     /* process the detection result */
@@ -570,9 +570,9 @@ int main(int argc, char* argv[])
         get_tensor_shape(bbox_blob_tensor, bbox_blob_dims, MAX_SHAPE_DIM_NUM);
         get_tensor_shape(landmark_blob_tensor, landmark_blob_dims, MAX_SHAPE_DIM_NUM);
 
-        float* score_blob = ( float* )get_tensor_buffer(score_blob_tensor);
-        float* bbox_blob = ( float* )get_tensor_buffer(bbox_blob_tensor);
-        float* landmark_blob = ( float* )get_tensor_buffer(landmark_blob_tensor);
+        float* score_blob = (float*)get_tensor_buffer(score_blob_tensor);
+        float* bbox_blob = (float*)get_tensor_buffer(bbox_blob_tensor);
+        float* landmark_blob = (float*)get_tensor_buffer(landmark_blob_tensor);
 
         const int base_size = 16;
         const int feat_stride = stride[stride_index];
@@ -615,10 +615,10 @@ int main(int argc, char* argv[])
         float x1 = x0 + face_objects[i].rect.w;
         float y1 = y0 + face_objects[i].rect.h;
 
-        x0 = std::max(std::min(x0, ( float )image_size.width - 1), 0.f);
-        y0 = std::max(std::min(y0, ( float )image_size.height - 1), 0.f);
-        x1 = std::max(std::min(x1, ( float )image_size.width - 1), 0.f);
-        y1 = std::max(std::min(y1, ( float )image_size.height - 1), 0.f);
+        x0 = std::max(std::min(x0, (float)image_size.width - 1), 0.f);
+        y0 = std::max(std::min(y0, (float)image_size.height - 1), 0.f);
+        x1 = std::max(std::min(x1, (float)image_size.width - 1), 0.f);
+        y1 = std::max(std::min(y1, (float)image_size.height - 1), 0.f);
 
         face_objects[i].rect.x = x0;
         face_objects[i].rect.y = y0;
diff --git a/examples/tm_ultraface.cpp b/examples/tm_ultraface.cpp
index ac426a36f..abf1f8f43 100644
--- a/examples/tm_ultraface.cpp
+++ b/examples/tm_ultraface.cpp
@@ -31,12 +31,12 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_REPEAT_COUNT    1
-#define DEFAULT_THREAD_COUNT    1
-#define num_featuremap 4
-#define hard_nms 1
-#define blending_nms 2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/
-#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x))
+#define DEFAULT_REPEAT_COUNT 1
+#define DEFAULT_THREAD_COUNT 1
+#define num_featuremap       4
+#define hard_nms             1
+#define blending_nms         2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/
+#define clip(x, y)           (x < 0 ? 0 : (x > y ? y : x))
 
 typedef struct FaceInfo
 {
@@ -114,49 +114,52 @@ static void nms(std::vector<FaceInfo>& input, std::vector<FaceInfo>& output, int
         }
         switch (type)
         {
-            case hard_nms: {
-                output.push_back(buf[0]);
-                break;
-            }
-            case blending_nms: {
-                float total = 0;
-                for (int i = 0; i < buf.size(); i++)
-                {
-                    total += exp(buf[i].score);
-                }
-                FaceInfo rects;
-                memset(&rects, 0, sizeof(rects));
-                for (int i = 0; i < buf.size(); i++)
-                {
-                    float rate = exp(buf[i].score) / total;
-                    rects.x1 += buf[i].x1 * rate;
-                    rects.y1 += buf[i].y1 * rate;
-                    rects.x2 += buf[i].x2 * rate;
-                    rects.y2 += buf[i].y2 * rate;
-                    rects.score += buf[i].score * rate;
-                }
-                output.push_back(rects);
-                break;
+        case hard_nms:
+        {
+            output.push_back(buf[0]);
+            break;
+        }
+        case blending_nms:
+        {
+            float total = 0;
+            for (int i = 0; i < buf.size(); i++)
+            {
+                total += exp(buf[i].score);
             }
-            default: {
-                fprintf(stderr, "wrong type of nms.");
-                exit(-1);
+            FaceInfo rects;
+            memset(&rects, 0, sizeof(rects));
+            for (int i = 0; i < buf.size(); i++)
+            {
+                float rate = exp(buf[i].score) / total;
+                rects.x1 += buf[i].x1 * rate;
+                rects.y1 += buf[i].y1 * rate;
+                rects.x2 += buf[i].x2 * rate;
+                rects.y2 += buf[i].y2 * rate;
+                rects.score += buf[i].score * rate;
             }
+            output.push_back(rects);
+            break;
+        }
+        default:
+        {
+            fprintf(stderr, "wrong type of nms.");
+            exit(-1);
+        }
         }
     }
 }
 
-static void post_process_ultraface(const char* image_file, float *boxs_data, float *scores_data)
+static void post_process_ultraface(const char* image_file, float* boxs_data, float* scores_data)
 {
     image im = imread(image_file);
     int image_h = im.h;
     int image_w = im.w;
 
-    const std::vector<std::vector<float>> min_boxes = {
+    const std::vector<std::vector<float> > min_boxes = {
         {10.0f, 16.0f, 24.0f}, {32.0f, 48.0f}, {64.0f, 96.0f}, {128.0f, 192.0f, 256.0f}};
-    std::vector<std::vector<float>> shrinkage_size;
-    std::vector<std::vector<float>> priors = {};
-    std::vector<std::vector<float>> featuremap_size;
+    std::vector<std::vector<float> > shrinkage_size;
+    std::vector<std::vector<float> > priors = {};
+    std::vector<std::vector<float> > featuremap_size;
     const std::vector<float> strides = {8.0, 16.0, 32.0, 64.0};
     std::vector<int> w_h_list = {g_tensor_in_w, g_tensor_in_h};
 
@@ -256,23 +259,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -315,8 +318,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = g_tensor_in_h * g_tensor_in_w * 3;
-    int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -335,7 +338,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -382,8 +385,8 @@ int main(int argc, char* argv[])
     tensor_t boxs_tensor = get_graph_output_tensor(graph, 0, 0);
     tensor_t scores_tensor = get_graph_output_tensor(graph, 1, 0);
 
-    float* boxs_data = (float* )get_tensor_buffer(boxs_tensor);
-    float* scores_data = (float* )get_tensor_buffer(scores_tensor);
+    float* boxs_data = (float*)get_tensor_buffer(boxs_tensor);
+    float* scores_data = (float*)get_tensor_buffer(scores_tensor);
 
     post_process_ultraface(image_file, boxs_data, scores_data);
 
diff --git a/examples/tm_unet.cpp b/examples/tm_unet.cpp
index 4d1929dba..3070846ae 100644
--- a/examples/tm_unet.cpp
+++ b/examples/tm_unet.cpp
@@ -35,17 +35,17 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 512 
-#define DEFAULT_IMG_W 512
-#define DEFAULT_SCALE1 (1.f/255.f)
-#define DEFAULT_SCALE2 (1.f/255.f)
-#define DEFAULT_SCALE3 (1.f/255.f)
-#define DEFAULT_MEAN1 0
-#define DEFAULT_MEAN2 0
-#define DEFAULT_MEAN3 0
-#define DEFAULT_LOOP_COUNT 1
-#define DEFAULT_THREAD_COUNT 1
-#define DEFAULT_CPU_AFFINITY 255
+#define DEFAULT_IMG_H          512
+#define DEFAULT_IMG_W          512
+#define DEFAULT_SCALE1         (1.f / 255.f)
+#define DEFAULT_SCALE2         (1.f / 255.f)
+#define DEFAULT_SCALE3         (1.f / 255.f)
+#define DEFAULT_MEAN1          0
+#define DEFAULT_MEAN2          0
+#define DEFAULT_MEAN3          0
+#define DEFAULT_LOOP_COUNT     1
+#define DEFAULT_THREAD_COUNT   1
+#define DEFAULT_CPU_AFFINITY   255
 #define DEFAULT_CONF_THRESHOLD 0.5f
 
 /**
@@ -55,22 +55,29 @@
  * because of the onnx->tmfile convertion problem, keep the network input size dividable by 16 (256,512) 
  */
 
-int draw_segmentation(const int32_t* data, int h, int w) {
-    static std::map<int32_t, cv::Vec3b> color_table = {{0, cv::Vec3b(0,0,0)},
-                                                       {1, cv::Vec3b(20,59,255)},
-                                                       {2, cv::Vec3b(120,59,200)},
-                                                       {3, cv::Vec3b(80,29,129)},
-                                                       {4, cv::Vec3b(210,99,12)}, // add more color if needed
-                                                       {-1, cv::Vec3b(255,255,255)} // other type
-                                                       };
+int draw_segmentation(const int32_t* data, int h, int w)
+{
+    static std::map<int32_t, cv::Vec3b> color_table = {
+        {0, cv::Vec3b(0, 0, 0)},
+        {1, cv::Vec3b(20, 59, 255)},
+        {2, cv::Vec3b(120, 59, 200)},
+        {3, cv::Vec3b(80, 29, 129)},
+        {4, cv::Vec3b(210, 99, 12)},   // add more color if needed
+        {-1, cv::Vec3b(255, 255, 255)} // other type
+    };
     cv::Mat img = cv::Mat::zeros(h, w, CV_8UC3);
-    for (int i = 0; i < h; ++i) {
-        for (int j = 0; j < w; ++j) {
+    for (int i = 0; i < h; ++i)
+    {
+        for (int j = 0; j < w; ++j)
+        {
             cv::Vec3b color;
             int32_t value = data[i * w + j];
-            if (color_table.count(value) > 0) {
+            if (color_table.count(value) > 0)
+            {
                 color = color_table.at(value);
-            } else {
+            }
+            else
+            {
                 color = color_table.at(-1);
             }
             img.at<cv::Vec3b>(i, j) = color;
@@ -81,7 +88,7 @@ int draw_segmentation(const int32_t* data, int h, int w) {
 }
 
 int tengine_segment(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
-                     const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh)
+                    const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh)
 {
     /* set runtime options */
     struct options opt;
@@ -108,8 +115,8 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -128,7 +135,7 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -170,45 +177,56 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     int channel = output_size / img_h / img_w;
     int res = output_size % (img_h * img_w);
-    if (res != 0) {
-      fprintf(stderr, "output shape is not supported.\n");
-    } else {
-      int* label_data = new int[img_h * img_w];
-      /* single class segmentation */
-      if (channel == 1) {
-        for (int i=0; i < img_h; ++i) {
-          for (int j=0; j < img_w; ++j) {
-              float conf = 1/(1+std::exp(-output_data[i*img_w + j]));
-              label_data[i*img_w + j] = conf > conf_thresh ? 1 : 0;
-          }
+    if (res != 0)
+    {
+        fprintf(stderr, "output shape is not supported.\n");
+    }
+    else
+    {
+        int* label_data = new int[img_h * img_w];
+        /* single class segmentation */
+        if (channel == 1)
+        {
+            for (int i = 0; i < img_h; ++i)
+            {
+                for (int j = 0; j < img_w; ++j)
+                {
+                    float conf = 1 / (1 + std::exp(-output_data[i * img_w + j]));
+                    label_data[i * img_w + j] = conf > conf_thresh ? 1 : 0;
+                }
+            }
         }
-      }
-      /* multi-class segmentation */
-      else {
-        for (int i=0; i < img_h; ++i) {
-          for (int j=0; j < img_w; ++j) {
-              int argmax_id = -1;
-              float max_conf = std::numeric_limits<float>::min();
-              for (int k=0; k < channel; ++k) {
-                  float out_value = output_data[k * img_w * img_h + i * img_w + j];
-                  if (out_value > max_conf) {
-                      argmax_id = k;
-                      max_conf = out_value;
-                  }
-              }
-              label_data[i*img_w + j] = argmax_id;
-          }
+        /* multi-class segmentation */
+        else
+        {
+            for (int i = 0; i < img_h; ++i)
+            {
+                for (int j = 0; j < img_w; ++j)
+                {
+                    int argmax_id = -1;
+                    float max_conf = std::numeric_limits<float>::min();
+                    for (int k = 0; k < channel; ++k)
+                    {
+                        float out_value = output_data[k * img_w * img_h + i * img_w + j];
+                        if (out_value > max_conf)
+                        {
+                            argmax_id = k;
+                            max_conf = out_value;
+                        }
+                    }
+                    label_data[i * img_w + j] = argmax_id;
+                }
+            }
         }
-      }
-      /* visualization */
-      draw_segmentation(label_data, img_h, img_w);
-      fprintf(stderr, "segmentatation result is save as unet_out.png\n");
-      delete[] label_data;
+        /* visualization */
+        draw_segmentation(label_data, img_h, img_w);
+        fprintf(stderr, "segmentatation result is save as unet_out.png\n");
+        delete[] label_data;
     }
 
     /* release tengine */
@@ -246,40 +264,40 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'c':
-                conf_thresh = atof(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'c':
+            conf_thresh = atof(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_yolact.cpp b/examples/tm_yolact.cpp
index 1d2e2d499..8e0aaa5e4 100644
--- a/examples/tm_yolact.cpp
+++ b/examples/tm_yolact.cpp
@@ -86,7 +86,7 @@ void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int
 
     cv::resize(img, img, cv::Size(img_h, img_w));
     img.convertTo(img, CV_32FC3);
-    float* img_data = ( float* )img.data;
+    float* img_data = (float*)img.data;
     int hw = img_h * img_w;
     for (int h = 0; h < img_h; h++)
     {
@@ -166,10 +166,10 @@ static inline float intersection_area(const Object& a, const Object& b)
     return inter.area();
 }
 
-static void fast_nms(std::vector<std::vector<Object>>& class_candidates, std::vector<Object>& objects,
+static void fast_nms(std::vector<std::vector<Object> >& class_candidates, std::vector<Object>& objects,
                      const float iou_thresh, const int nms_top_k, const int keep_top_k)
 {
-    for (int i = 0; i < ( int )class_candidates.size(); i++)
+    for (int i = 0; i < (int)class_candidates.size(); i++)
     {
         std::vector<Object>& candidate = class_candidates[i];
         std::sort(candidate.begin(), candidate.end(), [](const Object& a, const Object& b) { return a.prob > b.prob; });
@@ -189,7 +189,7 @@ static void fast_nms(std::vector<std::vector<Object>>& class_candidates, std::ve
         {
             areas[j] = candidate[j].rect.area();
         }
-        std::vector<std::vector<float>> iou_matrix;
+        std::vector<std::vector<float> > iou_matrix;
         for (int j = 0; j < n; j++)
         {
             std::vector<float> iou_row(n);
@@ -252,7 +252,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = target_size * target_size * 3;
-    int dims[] = {1, 3, target_size, target_size};    // nchw
+    int dims[] = {1, 3, target_size, target_size}; // nchw
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
@@ -272,7 +272,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -311,10 +311,10 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     tensor_t location_tensor = get_graph_output_tensor(graph, 2, 0);
     tensor_t mask_tensor = get_graph_output_tensor(graph, 3, 0);
     tensor_t confidence_tensor = get_graph_output_tensor(graph, 4, 0);
-    float* maskmaps = ( float* )get_tensor_buffer(maskmaps_tensor);
-    float* location = ( float* )get_tensor_buffer(location_tensor);
-    float* mask = ( float* )get_tensor_buffer(mask_tensor);
-    float* confidence = ( float* )get_tensor_buffer(confidence_tensor);
+    float* maskmaps = (float*)get_tensor_buffer(maskmaps_tensor);
+    float* location = (float*)get_tensor_buffer(location_tensor);
+    float* mask = (float*)get_tensor_buffer(mask_tensor);
+    float* confidence = (float*)get_tensor_buffer(confidence_tensor);
 
     int num_class = 81;
     int num_priors = 19248;
@@ -323,7 +323,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     const float nms_thresh = 0.5f;
     const int keep_top_k = 200;
 
-    std::vector<std::vector<Object>> class_candidates;
+    std::vector<std::vector<Object> > class_candidates;
     class_candidates.resize(num_class);
 
     for (int i = 0; i < num_priors; i++)
@@ -352,18 +352,18 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
 
         float bbox_cx = var[0] * loc[0] * priorbox.w + priorbox.cx;
         float bbox_cy = var[1] * loc[1] * priorbox.h + priorbox.cy;
-        float bbox_w = ( float )(exp(var[2] * loc[2]) * priorbox.w);
-        float bbox_h = ( float )(exp(var[3] * loc[3]) * priorbox.h);
+        float bbox_w = (float)(exp(var[2] * loc[2]) * priorbox.w);
+        float bbox_h = (float)(exp(var[3] * loc[3]) * priorbox.h);
 
         float obj_x1 = bbox_cx - bbox_w * 0.5f;
         float obj_y1 = bbox_cy - bbox_h * 0.5f;
         float obj_x2 = bbox_cx + bbox_w * 0.5f;
         float obj_y2 = bbox_cy + bbox_h * 0.5f;
 
-        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, ( float )(bgr.cols - 1)), 0.f);
-        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, ( float )(bgr.rows - 1)), 0.f);
-        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, ( float )(bgr.cols - 1)), 0.f);
-        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, ( float )(bgr.rows - 1)), 0.f);
+        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
 
         Object obj;
         obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
@@ -390,7 +390,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
             {
                 const float* maskmap = maskmaps + p;
                 float coeff = obj.maskdata[p];
-                float* mp = ( float* )mask1.data;
+                float* mp = (float*)mask1.data;
 
                 // mask += m * coeff
                 for (int j = 0; j < 138 * 138; j++)
@@ -534,8 +534,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         {0, 255, 75},
         {0, 255, 151},
         {255, 56, 0},
-        {245, 255, 0}
-    };
+        {245, 255, 0}};
 
     cv::Mat image = bgr.clone();
 
@@ -613,23 +612,23 @@ int main(int argc, char** argv)
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_yolact_uint8.cpp b/examples/tm_yolact_uint8.cpp
index e344d39c2..642b5acf8 100644
--- a/examples/tm_yolact_uint8.cpp
+++ b/examples/tm_yolact_uint8.cpp
@@ -61,8 +61,8 @@ struct Object
     cv::Mat mask;
 };
 
-void get_input_data_cv_uint8(const cv::Mat& sample, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale, 
-                       float input_scale, int zero_point)
+void get_input_data_cv_uint8(const cv::Mat& sample, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale,
+                             float input_scale, int zero_point)
 {
     cv::Mat img;
     if (sample.channels() == 4)
@@ -84,20 +84,21 @@ void get_input_data_cv_uint8(const cv::Mat& sample, uint8_t* input_data, int img
 
     cv::resize(img, img, cv::Size(img_h, img_w));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 float input_fp32 = (img_data[in_index] - mean[c]) * scale[c];
 
                 /* quant to uint8 */
-                int udata = (round)(input_fp32 / input_scale + ( float )zero_point);
+                int udata = (round)(input_fp32 / input_scale + (float)zero_point);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -174,10 +175,10 @@ static inline float intersection_area(const Object& a, const Object& b)
     return inter.area();
 }
 
-static void fast_nms(std::vector<std::vector<Object>>& class_candidates, std::vector<Object>& objects,
+static void fast_nms(std::vector<std::vector<Object> >& class_candidates, std::vector<Object>& objects,
                      const float iou_thresh, const int nms_top_k, const int keep_top_k)
 {
-    for (int i = 0; i < ( int )class_candidates.size(); i++)
+    for (int i = 0; i < (int)class_candidates.size(); i++)
     {
         std::vector<Object>& candidate = class_candidates[i];
         std::sort(candidate.begin(), candidate.end(), [](const Object& a, const Object& b) { return a.prob > b.prob; });
@@ -197,7 +198,7 @@ static void fast_nms(std::vector<std::vector<Object>>& class_candidates, std::ve
         {
             areas[j] = candidate[j].rect.area();
         }
-        std::vector<std::vector<float>> iou_matrix;
+        std::vector<std::vector<float> > iou_matrix;
         for (int j = 0; j < n; j++)
         {
             std::vector<float> iou_row(n);
@@ -260,7 +261,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = target_size * target_size * 3;
-    int dims[] = {1, 3, target_size, target_size};    // nchw
+    int dims[] = {1, 3, target_size, target_size}; // nchw
     std::vector<uint8_t> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
@@ -280,7 +281,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -292,7 +293,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     /* prepare process input data, set the data mem to input tensor */
     float input_scale = 0.f;
     int input_zero_point = 0;
-    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);    
+    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
     get_input_data_cv_uint8(bgr, input_data.data(), target_size, target_size, mean_vals, norm_vals, input_scale, input_zero_point);
 
     /* run graph */
@@ -318,19 +319,19 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     fprintf(stderr, "--------------------------------------\n");
 
     /* dequant output data */
-    tensor_t maskmaps_tensor   = get_graph_output_tensor(graph, 1, 0);
-    tensor_t location_tensor   = get_graph_output_tensor(graph, 2, 0);
-    tensor_t mask_tensor       = get_graph_output_tensor(graph, 3, 0);
+    tensor_t maskmaps_tensor = get_graph_output_tensor(graph, 1, 0);
+    tensor_t location_tensor = get_graph_output_tensor(graph, 2, 0);
+    tensor_t mask_tensor = get_graph_output_tensor(graph, 3, 0);
     tensor_t confidence_tensor = get_graph_output_tensor(graph, 4, 0);
 
     float maskmaps_scale = 0.f;
     float location_scale = 0.f;
-    float mask_scale     = 0.f;
+    float mask_scale = 0.f;
     float confidence_scale = 0.f;
 
     int maskmaps_zero_point = 0;
     int location_zero_point = 0;
-    int mask_zero_point     = 0;
+    int mask_zero_point = 0;
     int confidence_zero_point = 0;
 
     get_tensor_quant_param(maskmaps_tensor, &maskmaps_scale, &maskmaps_zero_point, 1);
@@ -338,15 +339,15 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     get_tensor_quant_param(mask_tensor, &mask_scale, &mask_zero_point, 1);
     get_tensor_quant_param(confidence_tensor, &confidence_scale, &confidence_zero_point, 1);
 
-    int maskmaps_count   = get_tensor_buffer_size(maskmaps_tensor) / sizeof(uint8_t);
-    int location_count   = get_tensor_buffer_size(location_tensor) / sizeof(uint8_t);
-    int mask_count       = get_tensor_buffer_size(mask_tensor) / sizeof(uint8_t);
+    int maskmaps_count = get_tensor_buffer_size(maskmaps_tensor) / sizeof(uint8_t);
+    int location_count = get_tensor_buffer_size(location_tensor) / sizeof(uint8_t);
+    int mask_count = get_tensor_buffer_size(mask_tensor) / sizeof(uint8_t);
     int confidence_count = get_tensor_buffer_size(confidence_tensor) / sizeof(uint8_t);
 
-    uint8_t* maskmaps_u8   = ( uint8_t* )get_tensor_buffer(maskmaps_tensor);
-    uint8_t* location_u8   = ( uint8_t* )get_tensor_buffer(location_tensor);
-    uint8_t* mask_u8       = ( uint8_t* )get_tensor_buffer(mask_tensor);
-    uint8_t* confidence_u8 = ( uint8_t* )get_tensor_buffer(confidence_tensor);
+    uint8_t* maskmaps_u8 = (uint8_t*)get_tensor_buffer(maskmaps_tensor);
+    uint8_t* location_u8 = (uint8_t*)get_tensor_buffer(location_tensor);
+    uint8_t* mask_u8 = (uint8_t*)get_tensor_buffer(mask_tensor);
+    uint8_t* confidence_u8 = (uint8_t*)get_tensor_buffer(confidence_tensor);
 
     std::vector<float> maskmaps(maskmaps_count);
     std::vector<float> location(location_count);
@@ -355,23 +356,23 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
 
     for (int c = 0; c < maskmaps_count; c++)
     {
-        maskmaps[c] = (( float )maskmaps_u8[c] - ( float )maskmaps_zero_point) * maskmaps_scale;
+        maskmaps[c] = ((float)maskmaps_u8[c] - (float)maskmaps_zero_point) * maskmaps_scale;
     }
 
     for (int c = 0; c < location_count; c++)
     {
-        location[c] = (( float )location_u8[c] - ( float )location_zero_point) * location_scale;
+        location[c] = ((float)location_u8[c] - (float)location_zero_point) * location_scale;
     }
 
     for (int c = 0; c < mask_count; c++)
     {
-        mask[c] = (( float )mask_u8[c] - ( float )mask_zero_point) * mask_scale;
+        mask[c] = ((float)mask_u8[c] - (float)mask_zero_point) * mask_scale;
     }
 
     for (int c = 0; c < confidence_count; c++)
     {
-        confidence[c] = (( float )confidence_u8[c] - ( float )confidence_zero_point) * confidence_scale;
-    }    
+        confidence[c] = ((float)confidence_u8[c] - (float)confidence_zero_point) * confidence_scale;
+    }
 
     /* postprocess */
     int num_class = 81;
@@ -381,7 +382,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
     const float nms_thresh = 0.5f;
     const int keep_top_k = 200;
 
-    std::vector<std::vector<Object>> class_candidates;
+    std::vector<std::vector<Object> > class_candidates;
     class_candidates.resize(num_class);
 
     for (int i = 0; i < num_priors; i++)
@@ -410,18 +411,18 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
 
         float bbox_cx = var[0] * loc[0] * priorbox.w + priorbox.cx;
         float bbox_cy = var[1] * loc[1] * priorbox.h + priorbox.cy;
-        float bbox_w = ( float )(exp(var[2] * loc[2]) * priorbox.w);
-        float bbox_h = ( float )(exp(var[3] * loc[3]) * priorbox.h);
+        float bbox_w = (float)(exp(var[2] * loc[2]) * priorbox.w);
+        float bbox_h = (float)(exp(var[3] * loc[3]) * priorbox.h);
 
         float obj_x1 = bbox_cx - bbox_w * 0.5f;
         float obj_y1 = bbox_cy - bbox_h * 0.5f;
         float obj_x2 = bbox_cx + bbox_w * 0.5f;
         float obj_y2 = bbox_cy + bbox_h * 0.5f;
 
-        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, ( float )(bgr.cols - 1)), 0.f);
-        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, ( float )(bgr.rows - 1)), 0.f);
-        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, ( float )(bgr.cols - 1)), 0.f);
-        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, ( float )(bgr.rows - 1)), 0.f);
+        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
 
         Object obj;
         obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
@@ -448,7 +449,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
             {
                 const float* maskmap = maskmaps.data() + p;
                 float coeff = obj.maskdata[p];
-                float* mp = ( float* )mask1.data;
+                float* mp = (float*)mask1.data;
 
                 // mask += m * coeff
                 for (int j = 0; j < 138 * 138; j++)
@@ -592,8 +593,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         {0, 255, 75},
         {0, 255, 151},
         {255, 56, 0},
-        {245, 255, 0}
-    };
+        {245, 255, 0}};
 
     cv::Mat image = bgr.clone();
 
@@ -671,23 +671,23 @@ int main(int argc, char** argv)
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/examples/tm_yolofastest.cpp b/examples/tm_yolofastest.cpp
index acaa42cdc..69824e9c6 100644
--- a/examples/tm_yolofastest.cpp
+++ b/examples/tm_yolofastest.cpp
@@ -23,7 +23,7 @@
  * 
  * original model: https://github.com/dog-qiuqiu/Yolo-Fastest/tree/master/ModelZoo/yolo-fastest-1.1_coco
  */
- 
+
 #include <iostream>
 #include <iomanip>
 #include <vector>
@@ -70,20 +70,20 @@ struct TMat
         return (const float*)data;
     }
 
-    float *row(int row) const
+    float* row(int row) const
     {
-        return (float *)data + w * row;
+        return (float*)data + w * row;
     }
 
-    TMat channel_range(int start, int chn_num) const 
+    TMat channel_range(int start, int chn_num) const
     {
-        TMat mat = { 0 };
+        TMat mat = {0};
 
         mat.batch = 1;
         mat.c = chn_num;
         mat.h = h;
         mat.w = w;
-        mat.data = (float *)data + start * h * w;
+        mat.data = (float*)data + start * h * w;
 
         return mat;
     }
@@ -94,7 +94,7 @@ struct TMat
     }
 
     int batch, c, h, w;
-    void *data;
+    void* data;
 };
 
 class Yolov3DetectionOutput
@@ -102,8 +102,8 @@ class Yolov3DetectionOutput
 public:
     int init(int version);
     int forward(const std::vector<TMat>& bottom_blobs, std::vector<TMat>& top_blobs);
-private:
 
+private:
     int m_num_box;
     int m_num_class;
     float m_anchors_scale[32];
@@ -122,24 +122,23 @@ static const char* class_names[] = {
     "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
     "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
     "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-    "hair drier", "toothbrush"
-};
+    "hair drier", "toothbrush"};
 
 int Yolov3DetectionOutput::init(int version)
 {
     memset(this, 0, sizeof(*this));
     m_num_box = 3;
     m_num_class = 80;
-	
-	fprintf(stderr, "Yolov3DetectionOutput init param[%d]\n", version);
-	
+
+    fprintf(stderr, "Yolov3DetectionOutput init param[%d]\n", version);
+
     if (version == YOLOV3)
     {
         m_anchors_scale[0] = 32;
         m_anchors_scale[1] = 16;
         m_anchors_scale[2] = 8;
 
-        float bias[] = { 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326 };
+        float bias[] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326};
         memcpy(m_biases, bias, sizeof(bias));
 
         m_mask[0] = 6;
@@ -159,7 +158,7 @@ int Yolov3DetectionOutput::init(int version)
         m_anchors_scale[0] = 32;
         m_anchors_scale[1] = 16;
 
-        float bias[] = { 12, 18,  37, 49,  52,132, 115, 73, 119,199, 242,238 };
+        float bias[] = {12, 18, 37, 49, 52, 132, 115, 73, 119, 199, 242, 238};
         memcpy(m_biases, bias, sizeof(bias));
 
         m_mask[0] = 3;
@@ -345,7 +344,7 @@ int Yolov3DetectionOutput::forward(const std::vector<TMat>& bottom_blobs, std::v
 
                         float area = bbox_w * bbox_h;
 
-                        BBoxRect c = { confidence, bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, area, class_index };
+                        BBoxRect c = {confidence, bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, area, class_index};
                         all_box_bbox_rects[pp].push_back(c);
                     }
 
@@ -410,13 +409,13 @@ int Yolov3DetectionOutput::forward(const std::vector<TMat>& bottom_blobs, std::v
 
 static void get_input_data_darknet(const char* image_file, float* input_data, int net_h, int net_w)
 {
-    float mean[3] = { 0.f, 0.f, 0.f };
-    float scale[3] = { 1.0f / 255, 1.0f / 255, 1.0f / 255 };
+    float mean[3] = {0.f, 0.f, 0.f};
+    float scale[3] = {1.0f / 255, 1.0f / 255, 1.0f / 255};
 
     //no letter box by default
     get_input_data(image_file, input_data, net_h, net_w, mean, scale);
     // input rgb
-    image swaprgb_img = { 0 };
+    image swaprgb_img = {0};
     swaprgb_img.c = 3;
     swaprgb_img.w = net_w;
     swaprgb_img.h = net_h;
@@ -429,11 +428,11 @@ static void show_usage()
     fprintf(stderr, "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
-static void run_yolo(graph_t graph, std::vector<BBoxRect> &boxes, int img_width, int img_height)
+static void run_yolo(graph_t graph, std::vector<BBoxRect>& boxes, int img_width, int img_height)
 {
     Yolov3DetectionOutput yolo;
     std::vector<TMat> yolo_inputs, yolo_outputs;
-    
+
     yolo.init(YOLO_FASTEST);
 
     int output_node_num = get_graph_output_node_number(graph);
@@ -442,8 +441,8 @@ static void run_yolo(graph_t graph, std::vector<BBoxRect> &boxes, int img_width,
 
     for (int i = 0; i < output_node_num; ++i)
     {
-        tensor_t out_tensor = get_graph_output_tensor(graph, i, 0);    //"detection_out"
-        int out_dim[4] = { 0 };
+        tensor_t out_tensor = get_graph_output_tensor(graph, i, 0); //"detection_out"
+        int out_dim[4] = {0};
         get_tensor_shape(out_tensor, out_dim, 4);
 
         yolo_inputs[i].batch = out_dim[0];
@@ -485,9 +484,9 @@ static void run_yolo(graph_t graph, std::vector<BBoxRect> &boxes, int img_width,
     //rect correct
     for (int i = 0; i < yolo_outputs[0].h; i++)
     {
-        float *data_row = yolo_outputs[0].row(i);
+        float* data_row = yolo_outputs[0].row(i);
 
-        BBoxRect box = { 0 };
+        BBoxRect box = {0};
         box.score = data_row[1];
         box.label = data_row[0];
         box.xmin = (data_row[2] - roi_left) / roi_width * img_width;
@@ -584,7 +583,7 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = net_h * net_w * 3;
-    int dims[] = { 1, 3, net_h, net_w };    // nchw
+    int dims[] = {1, 3, net_h, net_w}; // nchw
 
     std::vector<float> input_data(img_size);
 
@@ -636,7 +635,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count,
-        num_thread, total_time / repeat_count, max_time, min_time);
+            num_thread, total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* process the detection result */
diff --git a/examples/tm_yolov3.cpp b/examples/tm_yolov3.cpp
index 5127480c0..d2e5e7bad 100644
--- a/examples/tm_yolov3.cpp
+++ b/examples/tm_yolov3.cpp
@@ -146,15 +146,16 @@ void get_input_data_yolov3(const char* image_file, float* input_data, int img_h,
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -171,11 +172,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
     int feat_h = 416.0 / stride;
     int cls_num = 80;
     int anchor_group = 0;
-    if(stride == 8)
+    if (stride == 8)
         anchor_group = 1;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 2;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 3;
     //printf("anchor_group:%d\n",anchor_group);
     for (int h = 0; h <= feat_h - 1; h++)
@@ -191,7 +192,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -199,7 +200,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -219,7 +220,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     float pred_y = (h + dy) * stride;
                     float pred_w = exp(dw) * anchor_w;
                     float pred_h = exp(dh) * anchor_h;
-                    
+
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
                     float x1 = (pred_x + pred_w * 0.5f);
@@ -232,7 +233,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -250,8 +251,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -312,23 +312,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -355,7 +355,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -432,16 +432,16 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
-    tensor_t p8_output  = get_graph_output_tensor(graph, 2, 0);
+    tensor_t p8_output = get_graph_output_tensor(graph, 2, 0);
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
-    
-    float* p8_data  = ( float*)get_tensor_buffer(p8_output);
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+
+    float* p8_data = (float*)get_tensor_buffer(p8_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
     /* postprocess */
     const float prob_threshold = 0.4f;
@@ -473,7 +473,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov3_tiny.cpp b/examples/tm_yolov3_tiny.cpp
index d77a3d823..3516e2dea 100644
--- a/examples/tm_yolov3_tiny.cpp
+++ b/examples/tm_yolov3_tiny.cpp
@@ -146,15 +146,16 @@ void get_input_data_yolov3(const char* image_file, float* input_data, int img_h,
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -172,9 +173,9 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
     int cls_num = 80;
     int anchor_group = 0;
 
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 1;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 2;
     //printf("anchor_group:%d\n",anchor_group);
     for (int h = 0; h <= feat_h - 1; h++)
@@ -190,7 +191,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -198,7 +199,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -218,7 +219,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     float pred_y = (h + dy) * stride;
                     float pred_w = exp(dw) * anchor_w;
                     float pred_h = exp(dh) * anchor_h;
-                    
+
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
                     float x1 = (pred_x + pred_w * 0.5f);
@@ -231,7 +232,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -249,8 +250,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -311,23 +311,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -354,7 +354,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -381,7 +381,7 @@ int main(int argc, char* argv[])
 
     int img_size = img_h * img_w * img_c;
     int dims[] = {1, 3, img_h, img_w};
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -431,14 +431,14 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
-    
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
     /* postprocess */
     const float prob_threshold = 0.4f;
@@ -468,7 +468,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov3_tiny_uint8.cpp b/examples/tm_yolov3_tiny_uint8.cpp
index ed80e3b6e..54d701ee9 100644
--- a/examples/tm_yolov3_tiny_uint8.cpp
+++ b/examples/tm_yolov3_tiny_uint8.cpp
@@ -133,7 +133,7 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale,
+void get_input_data_yolov3_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale,
                                  float input_scale, int zero_point)
 {
     cv::Mat sample = cv::imread(image_file, 1);
@@ -147,20 +147,21 @@ void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, i
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 float input_fp32 = (img_data[in_index] - mean[c]) * scale[c];
 
                 /* quant to uint8 */
-                int udata = (round)(input_fp32 / input_scale + ( float )zero_point);
+                int udata = (round)(input_fp32 / input_scale + (float)zero_point);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -182,9 +183,9 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
     int cls_num = 80;
     int anchor_group = 0;
 
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 1;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 2;
     //printf("anchor_group:%d\n",anchor_group);
     for (int h = 0; h <= feat_h - 1; h++)
@@ -200,7 +201,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -208,7 +209,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -228,7 +229,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     float pred_y = (h + dy) * stride;
                     float pred_w = exp(dw) * anchor_w;
                     float pred_h = exp(dh) * anchor_h;
-	           
+
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
                     float x1 = (pred_x + pred_w * 0.5f);
@@ -241,7 +242,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -259,8 +260,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -321,23 +321,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -364,7 +364,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -444,7 +444,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* dequant output data */
@@ -462,21 +462,21 @@ int main(int argc, char* argv[])
     int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t);
     int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t);
 
-    uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output);
-    uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output);
+    uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output);
+    uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output);
 
     std::vector<float> p16_data(p16_count);
     std::vector<float> p32_data(p32_count);
 
     for (int c = 0; c < p16_count; c++)
     {
-        p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale;
+        p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale;
     }
 
     for (int c = 0; c < p32_count; c++)
     {
-        p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale;
-    }    
+        p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale;
+    }
 
     /* postprocess */
     const float prob_threshold = 0.4f;
@@ -505,7 +505,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov3_uint8.cpp b/examples/tm_yolov3_uint8.cpp
index 35bfca921..93d509ab1 100644
--- a/examples/tm_yolov3_uint8.cpp
+++ b/examples/tm_yolov3_uint8.cpp
@@ -133,7 +133,7 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale,
+void get_input_data_yolov3_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale,
                                  float input_scale, int zero_point)
 {
     cv::Mat sample = cv::imread(image_file, 1);
@@ -147,20 +147,21 @@ void get_input_data_yolov3_uint8(const char* image_file, uint8_t * input_data, i
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 float input_fp32 = (img_data[in_index] - mean[c]) * scale[c];
 
                 /* quant to uint8 */
-                int udata = (round)(input_fp32 / input_scale + ( float )zero_point);
+                int udata = (round)(input_fp32 / input_scale + (float)zero_point);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -181,11 +182,11 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
     int feat_h = 416.0 / stride;
     int cls_num = 80;
     int anchor_group = 0;
-    if(stride == 8)
+    if (stride == 8)
         anchor_group = 1;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 2;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 3;
     //printf("anchor_group:%d\n",anchor_group);
     for (int h = 0; h <= feat_h - 1; h++)
@@ -201,7 +202,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 {
                     int score_index = anchor * 85 * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -209,7 +210,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                 }
                 float box_score = feat[anchor * 85 * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * 85 * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * 85 * channel_size + feat_w * h + w + 1 * channel_size;
@@ -217,7 +218,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     int dh_index = anchor * 85 * channel_size + feat_w * h + w + 3 * channel_size;
 
                     float dx = sigmoid(feat[dx_index]);
-                    
+
                     float dy = sigmoid(feat[dy_index]);
 
                     float dw = feat[dw_index];
@@ -230,8 +231,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     float pred_y = (h + dy) * stride;
                     float pred_w = exp(dw) * anchor_w;
                     float pred_h = exp(dh) * anchor_h;
-                    
-	           
+
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
                     float x1 = (pred_x + pred_w * 0.5f);
@@ -244,7 +244,7 @@ static void generate_proposals(int stride, const float* feat, float prob_thresho
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -262,8 +262,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -324,23 +323,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -367,7 +366,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -447,18 +446,18 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* dequant output data */
-    tensor_t p8_output  = get_graph_output_tensor(graph, 2, 0);
+    tensor_t p8_output = get_graph_output_tensor(graph, 2, 0);
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
 
-    float p8_scale  = 0.f;
+    float p8_scale = 0.f;
     float p16_scale = 0.f;
     float p32_scale = 0.f;
-    int p8_zero_point  = 0;
+    int p8_zero_point = 0;
     int p16_zero_point = 0;
     int p32_zero_point = 0;
 
@@ -466,13 +465,13 @@ int main(int argc, char* argv[])
     get_tensor_quant_param(p16_output, &p16_scale, &p16_zero_point, 1);
     get_tensor_quant_param(p32_output, &p32_scale, &p32_zero_point, 1);
 
-    int p8_count  = get_tensor_buffer_size(p8_output) / sizeof(uint8_t);
+    int p8_count = get_tensor_buffer_size(p8_output) / sizeof(uint8_t);
     int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t);
     int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t);
 
-    uint8_t* p8_data_u8  = ( uint8_t* )get_tensor_buffer(p8_output);
-    uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output);
-    uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output);
+    uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output);
+    uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output);
+    uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output);
 
     std::vector<float> p8_data(p8_count);
     std::vector<float> p16_data(p16_count);
@@ -480,17 +479,17 @@ int main(int argc, char* argv[])
 
     for (int c = 0; c < p8_count; c++)
     {
-        p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale;
+        p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale;
     }
 
     for (int c = 0; c < p16_count; c++)
     {
-        p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale;
+        p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale;
     }
 
     for (int c = 0; c < p32_count; c++)
     {
-        p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale;
+        p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale;
     }
 
     /* postprocess */
@@ -523,7 +522,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov4.cpp b/examples/tm_yolov4.cpp
index b09624624..3dea741c0 100644
--- a/examples/tm_yolov4.cpp
+++ b/examples/tm_yolov4.cpp
@@ -148,15 +148,16 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h,
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -164,7 +165,7 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h,
     }
 }
 
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects)
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects)
 {
     static float anchors[18] = {12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401};
     int anchor_num = 3;
@@ -172,11 +173,11 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = 416 / stride;
     int cls_num = 80;
     int anchor_group = 0;
-    if(stride == 8)
+    if (stride == 8)
         anchor_group = 1;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 2;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 3;
 
     for (int h = 0; h <= feat_h - 1; h++)
@@ -192,7 +193,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -200,7 +201,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -218,8 +219,8 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 
                     float pred_x = (w + dx) * stride;
                     float pred_y = (h + dy) * stride;
-                    float pred_w = exp(dw) * anchor_w ;
-                    float pred_h = exp(dh) * anchor_h ;
+                    float pred_w = exp(dw) * anchor_w;
+                    float pred_h = exp(dh) * anchor_h;
 
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
@@ -233,7 +234,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -251,8 +252,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -313,23 +313,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -356,7 +356,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -433,18 +433,18 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     tensor_t p8_output = get_graph_output_tensor(graph, 0, 0);
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 2, 0);
-    
-    float* p8_data = ( float*)get_tensor_buffer(p8_output);
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
 
-	/* postprocess */
+    float* p8_data = (float*)get_tensor_buffer(p8_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
+
+    /* postprocess */
     const float prob_threshold = 0.45f;
     const float nms_threshold = 0.25f;
 
@@ -473,7 +473,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov4_tiny.cpp b/examples/tm_yolov4_tiny.cpp
index cd06f604a..512baed34 100644
--- a/examples/tm_yolov4_tiny.cpp
+++ b/examples/tm_yolov4_tiny.cpp
@@ -149,15 +149,16 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h,
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 input_data[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -165,7 +166,7 @@ void get_input_data_yolov4(const char* image_file, float* input_data, int img_h,
     }
 }
 
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects)
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects)
 {
     static float anchors[12] = {10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319};
     int anchor_num = 3;
@@ -173,9 +174,9 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = 416 / stride;
     int cls_num = 80;
     int anchor_group = 0;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 1;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 2;
 
     for (int h = 0; h <= feat_h - 1; h++)
@@ -191,7 +192,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -199,7 +200,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -217,8 +218,8 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 
                     float pred_x = (w + dx) * stride;
                     float pred_y = (h + dy) * stride;
-                    float pred_w = exp(dw) * anchor_w ;
-                    float pred_h = exp(dh) * anchor_h ;
+                    float pred_w = exp(dw) * anchor_w;
+                    float pred_h = exp(dh) * anchor_h;
 
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
@@ -232,7 +233,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -250,8 +251,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -312,23 +312,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -355,7 +355,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -432,17 +432,16 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
-
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
 
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
-	/* postprocess */
+    /* postprocess */
     const float prob_threshold = 0.45f;
     const float nms_threshold = 0.25f;
 
@@ -469,7 +468,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov4_tiny_timvx.cpp b/examples/tm_yolov4_tiny_timvx.cpp
index 0478d0d55..52f362fec 100644
--- a/examples/tm_yolov4_tiny_timvx.cpp
+++ b/examples/tm_yolov4_tiny_timvx.cpp
@@ -134,7 +134,7 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale,
+void get_input_data_yolov4_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale,
                                  float input_scale, int zero_point)
 {
     cv::Mat sample = cv::imread(image_file, 1);
@@ -148,20 +148,21 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 float input_fp32 = (img_data[in_index] - mean[c]) * scale[c];
 
                 /* quant to uint8 */
-                int udata = (round)(input_fp32 / input_scale + ( float )zero_point);
+                int udata = (round)(input_fp32 / input_scale + (float)zero_point);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -173,7 +174,7 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i
     }
 }
 
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects)
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects)
 {
     static float anchors[12] = {10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319};
     int anchor_num = 3;
@@ -181,9 +182,9 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = 416 / stride;
     int cls_num = 80;
     int anchor_group = 0;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 1;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 2;
 
     for (int h = 0; h <= feat_h - 1; h++)
@@ -199,7 +200,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -207,7 +208,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -225,8 +226,8 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 
                     float pred_x = (w + dx) * stride;
                     float pred_y = (h + dy) * stride;
-                    float pred_w = exp(dw) * anchor_w ;
-                    float pred_h = exp(dh) * anchor_h ;
+                    float pred_w = exp(dw) * anchor_w;
+                    float pred_h = exp(dh) * anchor_h;
 
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
@@ -240,7 +241,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -258,8 +259,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -320,23 +320,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -363,7 +363,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -452,7 +452,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* dequant output data */
@@ -470,23 +470,23 @@ int main(int argc, char* argv[])
     int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t);
     int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t);
 
-    uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output);
-    uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output);
+    uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output);
+    uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output);
 
     std::vector<float> p16_data(p16_count);
     std::vector<float> p32_data(p32_count);
 
     for (int c = 0; c < p16_count; c++)
     {
-        p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale;
+        p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale;
     }
 
     for (int c = 0; c < p32_count; c++)
     {
-        p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale;
+        p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale;
     }
 
-	/* postprocess */
+    /* postprocess */
     const float prob_threshold = 0.45f;
     const float nms_threshold = 0.25f;
 
@@ -513,7 +513,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov4_tiny_uint8.cpp b/examples/tm_yolov4_tiny_uint8.cpp
index 2c3c995ac..4ea318c56 100644
--- a/examples/tm_yolov4_tiny_uint8.cpp
+++ b/examples/tm_yolov4_tiny_uint8.cpp
@@ -134,7 +134,7 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale,
+void get_input_data_yolov4_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale,
                                  float input_scale, int zero_point)
 {
     cv::Mat sample = cv::imread(image_file, 1);
@@ -148,20 +148,21 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 float input_fp32 = (img_data[in_index] - mean[c]) * scale[c];
 
                 /* quant to uint8 */
-                int udata = (round)(input_fp32 / input_scale + ( float )zero_point);
+                int udata = (round)(input_fp32 / input_scale + (float)zero_point);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -173,7 +174,7 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i
     }
 }
 
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects)
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects)
 {
     static float anchors[12] = {10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319};
     int anchor_num = 3;
@@ -181,9 +182,9 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = 416 / stride;
     int cls_num = 80;
     int anchor_group = 0;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 1;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 2;
 
     for (int h = 0; h <= feat_h - 1; h++)
@@ -199,7 +200,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -207,7 +208,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -225,8 +226,8 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 
                     float pred_x = (w + dx) * stride;
                     float pred_y = (h + dy) * stride;
-                    float pred_w = exp(dw) * anchor_w ;
-                    float pred_h = exp(dh) * anchor_h ;
+                    float pred_w = exp(dw) * anchor_w;
+                    float pred_h = exp(dh) * anchor_h;
 
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
@@ -240,7 +241,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -258,8 +259,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -320,23 +320,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -363,7 +363,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -443,7 +443,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* dequant output data */
@@ -461,23 +461,23 @@ int main(int argc, char* argv[])
     int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t);
     int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t);
 
-    uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output);
-    uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output);
+    uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output);
+    uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output);
 
     std::vector<float> p16_data(p16_count);
     std::vector<float> p32_data(p32_count);
 
     for (int c = 0; c < p16_count; c++)
     {
-        p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale;
+        p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale;
     }
 
     for (int c = 0; c < p32_count; c++)
     {
-        p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale;
+        p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale;
     }
 
-	/* postprocess */
+    /* postprocess */
     const float prob_threshold = 0.45f;
     const float nms_threshold = 0.25f;
 
@@ -504,7 +504,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov4_uint8.cpp b/examples/tm_yolov4_uint8.cpp
index 78e867979..463ea9d7e 100644
--- a/examples/tm_yolov4_uint8.cpp
+++ b/examples/tm_yolov4_uint8.cpp
@@ -134,7 +134,7 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, int img_h, int img_w, const float* mean, const float* scale,
+void get_input_data_yolov4_uint8(const char* image_file, uint8_t* input_data, int img_h, int img_w, const float* mean, const float* scale,
                                  float input_scale, int zero_point)
 {
     cv::Mat sample = cv::imread(image_file, 1);
@@ -148,20 +148,21 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i
     /* resize process */
     cv::resize(img, img, cv::Size(img_w, img_h));
     img.convertTo(img, CV_32FC3);
-    float* img_data = (float* )img.data;
+    float* img_data = (float*)img.data;
 
     /* nhwc to nchw */
     for (int h = 0; h < img_h; h++)
-    {   for (int w = 0; w < img_w; w++)
+    {
+        for (int w = 0; w < img_w; w++)
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * img_w * 3 + w * 3 + c;
+                int in_index = h * img_w * 3 + w * 3 + c;
                 int out_index = c * img_h * img_w + h * img_w + w;
                 float input_fp32 = (img_data[in_index] - mean[c]) * scale[c];
 
                 /* quant to uint8 */
-                int udata = (round)(input_fp32 / input_scale + ( float )zero_point);
+                int udata = (round)(input_fp32 / input_scale + (float)zero_point);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -173,7 +174,7 @@ void get_input_data_yolov4_uint8(const char* image_file, uint8_t * input_data, i
     }
 }
 
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects)
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects)
 {
     static float anchors[18] = {12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401};
     int anchor_num = 3;
@@ -181,11 +182,11 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = 416 / stride;
     int cls_num = 80;
     int anchor_group = 0;
-    if(stride == 8)
+    if (stride == 8)
         anchor_group = 1;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 2;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 3;
 
     for (int h = 0; h <= feat_h - 1; h++)
@@ -201,7 +202,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 {
                     int score_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + (s + 5) * channel_size;
                     float score = feat[score_index];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -209,7 +210,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 float box_score = feat[anchor * (cls_num + 5) * channel_size + feat_w * h + w + 4 * channel_size];
                 float final_score = sigmoid(box_score) * sigmoid(class_score);
-                if(final_score >= prob_threshold)
+                if (final_score >= prob_threshold)
                 {
                     int dx_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 0 * channel_size;
                     int dy_index = anchor * (cls_num + 5) * channel_size + feat_w * h + w + 1 * channel_size;
@@ -227,8 +228,8 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 
                     float pred_x = (w + dx) * stride;
                     float pred_y = (h + dy) * stride;
-                    float pred_w = exp(dw) * anchor_w ;
-                    float pred_h = exp(dh) * anchor_h ;
+                    float pred_w = exp(dw) * anchor_w;
+                    float pred_h = exp(dh) * anchor_h;
 
                     float x0 = (pred_x - pred_w * 0.5f);
                     float y0 = (pred_y - pred_h * 0.5f);
@@ -242,7 +243,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                     obj.rect.height = y1 - y0;
                     obj.label = class_index;
                     obj.prob = final_score;
-                    objects.push_back(obj); 
+                    objects.push_back(obj);
                 }
             }
         }
@@ -260,8 +261,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-        "hair drier", "toothbrush"
-    };
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -322,23 +322,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -365,7 +365,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "cv::imread %s failed\n", image_file);
         return -1;
-    }    
+    }
 
     /* set runtime options */
     struct options opt;
@@ -445,18 +445,18 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* dequant output data */
-    tensor_t p8_output  = get_graph_output_tensor(graph, 2, 0);
+    tensor_t p8_output = get_graph_output_tensor(graph, 2, 0);
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
 
-    float p8_scale  = 0.f;
+    float p8_scale = 0.f;
     float p16_scale = 0.f;
     float p32_scale = 0.f;
-    int p8_zero_point  = 0;
+    int p8_zero_point = 0;
     int p16_zero_point = 0;
     int p32_zero_point = 0;
 
@@ -464,13 +464,13 @@ int main(int argc, char* argv[])
     get_tensor_quant_param(p16_output, &p16_scale, &p16_zero_point, 1);
     get_tensor_quant_param(p32_output, &p32_scale, &p32_zero_point, 1);
 
-    int p8_count  = get_tensor_buffer_size(p8_output) / sizeof(uint8_t);
+    int p8_count = get_tensor_buffer_size(p8_output) / sizeof(uint8_t);
     int p16_count = get_tensor_buffer_size(p16_output) / sizeof(uint8_t);
     int p32_count = get_tensor_buffer_size(p32_output) / sizeof(uint8_t);
 
-    uint8_t* p8_data_u8  = ( uint8_t* )get_tensor_buffer(p8_output);
-    uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output);
-    uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output);
+    uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output);
+    uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output);
+    uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output);
 
     std::vector<float> p8_data(p8_count);
     std::vector<float> p16_data(p16_count);
@@ -478,20 +478,20 @@ int main(int argc, char* argv[])
 
     for (int c = 0; c < p8_count; c++)
     {
-        p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale;
+        p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale;
     }
 
     for (int c = 0; c < p16_count; c++)
     {
-        p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale;
+        p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale;
     }
 
     for (int c = 0; c < p32_count; c++)
     {
-        p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale;
+        p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale;
     }
 
-	/* postprocess */
+    /* postprocess */
     const float prob_threshold = 0.45f;
     const float nms_threshold = 0.25f;
 
@@ -520,7 +520,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)raw_h / img_h;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolov5.cpp b/examples/tm_yolov5.cpp
index acf3e4c56..2debc8ece 100644
--- a/examples/tm_yolov5.cpp
+++ b/examples/tm_yolov5.cpp
@@ -31,7 +31,6 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-
 static constexpr int kAnchorNum = 3;
 static constexpr int kClassNum = 80;
 static constexpr float kIgnoreThresh = 0.5f;
@@ -71,7 +70,7 @@ void correct_yolo_boxes(std::vector<detection>& dets, int w, int h, int netw, in
     int i;
     int new_w = 0;
     int new_h = 0;
-    if ((( float )netw / w) < (( float )neth / h))
+    if (((float)netw / w) < ((float)neth / h))
     {
         new_w = netw;
         new_h = (h * netw) / w;
@@ -85,10 +84,10 @@ void correct_yolo_boxes(std::vector<detection>& dets, int w, int h, int netw, in
     for (i = 0; i < dets.size(); ++i)
     {
         box b = dets[i].bbox;
-        b.x = (b.x - (netw - new_w) / 2.) / (( float )new_w / w);
-        b.y = (b.y - (neth - new_h) / 2.) / (( float )new_h / h);
-        b.w /= (( float )new_w / w);
-        b.h /= (( float )new_h / h);
+        b.x = (b.x - (netw - new_w) / 2.) / ((float)new_w / w);
+        b.y = (b.y - (neth - new_h) / 2.) / ((float)new_h / h);
+        b.w /= ((float)new_w / w);
+        b.h /= ((float)new_h / h);
 
         dets[i].bbox = b;
     }
@@ -123,8 +122,7 @@ std::vector<detection> forward_darknet_layer_cpu(const float* input, int img_w,
         {
             for (int channel = 0; channel < 3; channel++)
             {
-                const float* pdata = input + channel * out_h * out_w * (kClassNum + 5) +
-                                      shift_y * out_w * (kClassNum + 5) + shift_x * (kClassNum + 5);
+                const float* pdata = input + channel * out_h * out_w * (kClassNum + 5) + shift_y * out_w * (kClassNum + 5) + shift_x * (kClassNum + 5);
                 float box_prob = logistic_cpu(*(pdata + 4));
                 if (box_prob < kIgnoreThresh)
                     continue;
@@ -212,7 +210,7 @@ std::vector<detection> do_nms_sort(std::vector<detection>& dets, int classes, fl
     for (int k = 0; k < classes; ++k)
     {
         std::vector<detection> class_detection;
-        for (auto & det : dets)
+        for (auto& det : dets)
         {
             if (det.classes == k)
             {
@@ -220,7 +218,7 @@ std::vector<detection> do_nms_sort(std::vector<detection>& dets, int classes, fl
             }
         }
 
-        std::sort(class_detection.begin(), class_detection.end(), [](const detection & a, const detection & b) {
+        std::sort(class_detection.begin(), class_detection.end(), [](const detection& a, const detection& b) {
             return a.prob > b.prob;
         });
 
@@ -282,28 +280,28 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 's':
-                net_w = std::strtoul(optarg, nullptr, 10);
-                net_h = net_w;
-                fprintf(stderr, "set net input size: %d %d\n", net_h, net_w);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 's':
+            net_w = std::strtoul(optarg, nullptr, 10);
+            net_h = net_w;
+            fprintf(stderr, "set net input size: %d %d\n", net_h, net_w);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -350,7 +348,7 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = net_h * net_w * 3;
-    int dims[] = {1, 3, net_h, net_w};    // nchw
+    int dims[] = {1, 3, net_h, net_w}; // nchw
 
     std::vector<float> input_data(img_size);
 
@@ -418,7 +416,7 @@ int main(int argc, char* argv[])
         int out_dim[5];
         get_tensor_shape(out_tensor, out_dim, 5);
 
-        float* out_data = ( float* )get_tensor_buffer(out_tensor);
+        float* out_data = (float*)get_tensor_buffer(out_tensor);
         int out_w = out_dim[3];
         int out_h = out_dim[2];
         auto node_detection = forward_darknet_layer_cpu(out_data, img.w, img.h, net_w, net_h, out_w, out_h);
diff --git a/examples/tm_yolov5s.cpp b/examples/tm_yolov5s.cpp
index bef5d476b..b8f277a4a 100644
--- a/examples/tm_yolov5s.cpp
+++ b/examples/tm_yolov5s.cpp
@@ -135,9 +135,9 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects,
-                               int letterbox_cols, int letterbox_rows){
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects,
+                               int letterbox_cols, int letterbox_rows)
+{
     static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326};
 
     int anchor_num = 3;
@@ -145,11 +145,11 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = letterbox_rows / stride;
     int cls_num = 80;
     int anchor_group;
-    if(stride == 8)
+    if (stride == 8)
         anchor_group = 1;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 2;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 3;
     for (int h = 0; h <= feat_h - 1; h++)
     {
@@ -163,7 +163,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 for (int s = 0; s <= cls_num - 1; s++)
                 {
                     float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -171,7 +171,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 //process box score
                 float box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4];
-                float final_score = sigmoid(box_score ) * sigmoid(class_score);
+                float final_score = sigmoid(box_score) * sigmoid(class_score);
                 if (final_score >= prob_threshold)
                 {
                     int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5);
@@ -207,16 +207,15 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
 {
     static const char* class_names[] = {
-            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
-            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-            "hair drier", "toothbrush"
-    };
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -255,8 +254,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
 void show_usage()
 {
     fprintf(
-            stderr,
-            "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
+        stderr,
+        "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
 void get_input_data_focus(const char* image_file, float* input_data, int letterbox_rows, int letterbox_cols, const float* mean, const float* scale)
@@ -273,9 +272,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
     resize_cols = int(scale_letterbox * img.cols);
@@ -284,7 +286,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
     cv::resize(img, img, cv::Size(resize_cols, resize_rows));
     img.convertTo(img, CV_32FC3);
     // Generate a gray image for letterbox using opencv
-    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3,cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2]));
+    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2]));
     int top = (letterbox_rows - resize_rows) / 2;
     int bot = (letterbox_rows - resize_rows + 1) / 2;
     int left = (letterbox_cols - resize_cols) / 2;
@@ -293,7 +295,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
     cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
 
     img_new.convertTo(img_new, CV_32FC3);
-    float* img_data   = (float* )img_new.data;
+    float* img_data = (float*)img_new.data;
     std::vector<float> input_temp(3 * letterbox_cols * letterbox_rows);
 
     /* nhwc to nchw */
@@ -303,7 +305,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * letterbox_cols * 3 + w * 3 + c;
+                int in_index = h * letterbox_cols * 3 + w * 3 + c;
                 int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
                 input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -317,17 +319,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
         {
             for (int c = 0; c < 3; c++)
             {
-                for (int h = 0; h < letterbox_rows/2; h++)
+                for (int h = 0; h < letterbox_rows / 2; h++)
                 {
-                    for (int w = 0; w < letterbox_cols/2; w++)
+                    for (int w = 0; w < letterbox_cols / 2; w++)
                     {
-                        int in_index  = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows +
-                                        h * 2 * letterbox_cols + w * 2;
-                        int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        g * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        c * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        h * (letterbox_cols/2) +
-                                        w;
+                        int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2;
+                        int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w;
 
                         input_data[out_index] = input_temp[in_index];
                     }
@@ -337,7 +334,6 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
     }
 }
 
-
 int main(int argc, char* argv[])
 {
     const char* model_file = nullptr;
@@ -359,23 +355,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -479,7 +475,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* yolov5 postprocess */
@@ -490,9 +486,9 @@ int main(int argc, char* argv[])
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 2, 0);
 
-    float* p8_data = ( float*)get_tensor_buffer(p8_output);
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+    float* p8_data = (float*)get_tensor_buffer(p8_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
     /* postprocess */
     const float prob_threshold = 0.25f;
@@ -508,7 +504,7 @@ int main(int argc, char* argv[])
     proposals.insert(proposals.end(), objects32.begin(), objects32.end());
     generate_proposals(16, p16_data, prob_threshold, objects16, letterbox_cols, letterbox_rows);
     proposals.insert(proposals.end(), objects16.begin(), objects16.end());
-    generate_proposals( 8, p8_data, prob_threshold, objects8, letterbox_cols, letterbox_rows);
+    generate_proposals(8, p8_data, prob_threshold, objects8, letterbox_cols, letterbox_rows);
     proposals.insert(proposals.end(), objects8.begin(), objects8.end());
 
     qsort_descent_inplace(proposals);
@@ -520,9 +516,12 @@ int main(int argc, char* argv[])
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
     resize_cols = int(scale_letterbox * img.cols);
@@ -535,7 +534,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)img.cols / resize_cols;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
@@ -569,4 +568,3 @@ int main(int argc, char* argv[])
     destroy_graph(graph);
     release_tengine();
 }
-
diff --git a/examples/tm_yolov5s_timvx.cpp b/examples/tm_yolov5s_timvx.cpp
index 0152ee175..7f5198951 100644
--- a/examples/tm_yolov5s_timvx.cpp
+++ b/examples/tm_yolov5s_timvx.cpp
@@ -37,7 +37,6 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-
 struct Object
 {
     cv::Rect_<float> rect;
@@ -135,8 +134,7 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects,
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects,
                                int letterbox_cols, int letterbox_rows)
 {
     static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326};
@@ -146,11 +144,11 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = letterbox_rows / stride;
     int cls_num = 80;
     int anchor_group;
-    if(stride == 8)
+    if (stride == 8)
         anchor_group = 1;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 2;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 3;
     for (int h = 0; h <= feat_h - 1; h++)
     {
@@ -164,7 +162,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 for (int s = 0; s <= cls_num - 1; s++)
                 {
                     float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -172,7 +170,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 //process box score
                 float box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4];
-                float final_score = sigmoid(box_score ) * sigmoid(class_score);
+                float final_score = sigmoid(box_score) * sigmoid(class_score);
                 if (final_score >= prob_threshold)
                 {
                     int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5);
@@ -208,16 +206,15 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
 {
     static const char* class_names[] = {
-            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
-            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-            "hair drier", "toothbrush"
-    };
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -256,8 +253,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
 void show_usage()
 {
     fprintf(
-            stderr,
-            "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
+        stderr,
+        "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
 void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean,
@@ -275,9 +272,12 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
     resize_cols = int(scale_letterbox * img.cols);
@@ -286,7 +286,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
     cv::resize(img, img, cv::Size(resize_cols, resize_rows));
     img.convertTo(img, CV_32FC3);
     // Generate a gray image for letterbox using opencv
-    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3,cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2]));
+    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2]));
     int top = (letterbox_rows - resize_rows) / 2;
     int bot = (letterbox_rows - resize_rows + 1) / 2;
     int left = (letterbox_cols - resize_cols) / 2;
@@ -295,7 +295,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
     cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
 
     img_new.convertTo(img_new, CV_32FC3);
-    float* img_data   = (float* )img_new.data;
+    float* img_data = (float*)img_new.data;
     std::vector<float> input_temp(3 * letterbox_cols * letterbox_rows);
 
     /* nhwc to nchw */
@@ -305,7 +305,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * letterbox_cols * 3 + w * 3 + c;
+                int in_index = h * letterbox_cols * 3 + w * 3 + c;
                 int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
                 input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -319,20 +319,15 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
         {
             for (int c = 0; c < 3; c++)
             {
-                for (int h = 0; h < letterbox_rows/2; h++)
+                for (int h = 0; h < letterbox_rows / 2; h++)
                 {
-                    for (int w = 0; w < letterbox_cols/2; w++)
+                    for (int w = 0; w < letterbox_cols / 2; w++)
                     {
-                        int in_index  = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows +
-                                        h * 2 * letterbox_cols + w * 2;
-                        int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        g * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        c * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        h * (letterbox_cols/2) +
-                                        w;
+                        int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2;
+                        int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w;
 
                         /* quant to uint8 */
-                        int udata = (round)(input_temp[in_index] / input_scale + ( float )zero_point);
+                        int udata = (round)(input_temp[in_index] / input_scale + (float)zero_point);
                         if (udata > 255)
                             udata = 255;
                         else if (udata < 0)
@@ -366,23 +361,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -498,7 +493,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* yolov5 postprocess */
@@ -529,23 +524,23 @@ int main(int argc, char* argv[])
     std::vector<float> p16_data(p16_count);
     std::vector<float> p32_data(p32_count);
 
-    uint8_t* p8_data_u8  = ( uint8_t* )get_tensor_buffer(p8_output);
-    uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output);
-    uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output);
+    uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output);
+    uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output);
+    uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output);
 
     for (int c = 0; c < p8_count; c++)
     {
-        p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale;
+        p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale;
     }
 
     for (int c = 0; c < p16_count; c++)
     {
-        p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale;
+        p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale;
     }
 
     for (int c = 0; c < p32_count; c++)
     {
-        p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale;
+        p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale;
     }
 
     /* postprocess */
@@ -562,7 +557,7 @@ int main(int argc, char* argv[])
     proposals.insert(proposals.end(), objects32.begin(), objects32.end());
     generate_proposals(16, p16_data.data(), prob_threshold, objects16, letterbox_cols, letterbox_rows);
     proposals.insert(proposals.end(), objects16.begin(), objects16.end());
-    generate_proposals( 8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows);
+    generate_proposals(8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows);
     proposals.insert(proposals.end(), objects8.begin(), objects8.end());
 
     qsort_descent_inplace(proposals);
@@ -574,9 +569,12 @@ int main(int argc, char* argv[])
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
     resize_cols = int(scale_letterbox * img.cols);
@@ -589,7 +587,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)img.cols / resize_cols;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
@@ -623,4 +621,3 @@ int main(int argc, char* argv[])
     destroy_graph(graph);
     release_tengine();
 }
-
diff --git a/examples/tm_yolox.cpp b/examples/tm_yolox.cpp
index 9f54b5806..e9da0dd5e 100644
--- a/examples/tm_yolox.cpp
+++ b/examples/tm_yolox.cpp
@@ -45,7 +45,6 @@ struct Object
     float prob;
 };
 
-
 static inline float intersection_area(const Object& a, const Object& b)
 {
     cv::Rect_<float> inter = a.rect & b.rect;
@@ -134,16 +133,15 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
 static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
 {
     static const char* class_names[] = {
-            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
-            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-            "hair drier", "toothbrush"
-    };
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -175,7 +173,7 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
         cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5,
                     cv::Scalar(0, 0, 0));
     }
-    
+
     cv::imwrite("yolox_out.jpg", image);
 }
 struct GridAndStride
@@ -211,10 +209,10 @@ static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, fl
     const int num_grid = 3549;
     const int num_class = 80;
     const int num_anchors = grid_strides.size();
-    
+
     for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
     {
-       // printf("%d,%d\n",num_anchors,anchor_idx);
+        // printf("%d,%d\n",num_anchors,anchor_idx);
         const int grid0 = grid_strides[anchor_idx].grid0;
         const int grid1 = grid_strides[anchor_idx].grid1;
         const int stride = grid_strides[anchor_idx].stride;
@@ -228,7 +226,7 @@ static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, fl
         float h = exp(feat_ptr[3]) * stride;
         float x0 = x_center - w * 0.5f;
         float y0 = y_center - h * 0.5f;
-        
+
         float box_objectness = feat_ptr[4];
 
         for (int class_idx = 0; class_idx < num_class; class_idx++)
@@ -252,13 +250,13 @@ static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, fl
         feat_ptr += 85;
 
     } // point anchor loop
-}   
+}
 
 void show_usage()
 {
     fprintf(
-            stderr,
-            "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
+        stderr,
+        "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
 void get_input_data_focus(const char* image_file, float* input_data, int letterbox_rows, int letterbox_cols, const float* mean, const float* scale)
@@ -275,9 +273,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
     resize_cols = int(scale_letterbox * img.cols);
@@ -287,7 +288,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
 
     img.convertTo(img, CV_32FC3);
     // Generate a gray image for letterbox using opencv
-    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0)/*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/);
+    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0) /*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/);
     int top = 0;
     int bot = letterbox_rows - resize_rows;
     int left = 0;
@@ -296,7 +297,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
     cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(114.f, 114.f, 114.f));
 
     img_new.convertTo(img_new, CV_32FC3);
-    float* img_data   = (float* )img_new.data;
+    float* img_data = (float*)img_new.data;
     std::vector<float> input_temp(3 * letterbox_cols * letterbox_rows);
 
     /* nhwc to nchw */
@@ -306,7 +307,7 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * letterbox_cols * 3 + w * 3 + c;
+                int in_index = h * letterbox_cols * 3 + w * 3 + c;
                 int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
                 input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -320,17 +321,12 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
         {
             for (int c = 0; c < 3; c++)
             {
-                for (int h = 0; h < letterbox_rows/2; h++)
+                for (int h = 0; h < letterbox_rows / 2; h++)
                 {
-                    for (int w = 0; w < letterbox_cols/2; w++)
+                    for (int w = 0; w < letterbox_cols / 2; w++)
                     {
-                        int in_index  = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows +
-                                        h * 2 * letterbox_cols + w * 2;
-                        int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        g * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        c * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        h * (letterbox_cols/2) +
-                                        w;
+                        int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2;
+                        int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w;
 
                         input_data[out_index] = input_temp[in_index];
                     }
@@ -340,7 +336,6 @@ void get_input_data_focus(const char* image_file, float* input_data, int letterb
     }
 }
 
-
 int main(int argc, char* argv[])
 {
     const char* model_file = nullptr;
@@ -362,23 +357,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -435,7 +430,7 @@ int main(int argc, char* argv[])
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
-	
+
     if (input_tensor == nullptr)
     {
         fprintf(stderr, "Get input tensor failed\n");
@@ -483,16 +478,16 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* yolox postprocess */
     tensor_t p8_output = get_graph_output_tensor(graph, 0, 0);
-    float* p8_data = ( float*)get_tensor_buffer(p8_output);
+    float* p8_data = (float*)get_tensor_buffer(p8_output);
 
     /* postprocess */
-	const float prob_threshold = 0.3f;
-	const float nms_threshold = 0.65f;
+    const float prob_threshold = 0.3f;
+    const float nms_threshold = 0.65f;
 
     std::vector<Object> proposals;
     std::vector<Object> objects;
@@ -509,14 +504,17 @@ int main(int argc, char* argv[])
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
diff --git a/examples/tm_yolox_timvx.cpp b/examples/tm_yolox_timvx.cpp
index da778080d..2aced6f2e 100644
--- a/examples/tm_yolox_timvx.cpp
+++ b/examples/tm_yolox_timvx.cpp
@@ -1,578 +1,575 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2021, OPEN AI LAB
- * Author: xwwang@openailab.com
- * Author: 774074168@qq.com
- * Author: honghao@openailab.com
- * original model: https://github.com/Megvii-BaseDetection/YOLOX
- */
-
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <cmath>
-#include <stdlib.h>
-#include <opencv2/core/core.hpp>
-#include <opencv2/highgui/highgui.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
-
-#include "common.h"
-#include "tengine/c_api.h"
-#include "tengine_operations.h"
-
-struct Object
-{
-    cv::Rect_<float> rect;
-    int label;
-    float prob;
-};
-
-
-static inline float intersection_area(const Object& a, const Object& b)
-{
-    cv::Rect_<float> inter = a.rect & b.rect;
-    return inter.area();
-}
-
-static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
-{
-    int i = left;
-    int j = right;
-    float p = faceobjects[(left + right) / 2].prob;
-
-    while (i <= j)
-    {
-        while (faceobjects[i].prob > p)
-            i++;
-
-        while (faceobjects[j].prob < p)
-            j--;
-
-        if (i <= j)
-        {
-            // swap
-            std::swap(faceobjects[i], faceobjects[j]);
-
-            i++;
-            j--;
-        }
-    }
-
-#pragma omp parallel sections
-    {
-#pragma omp section
-        {
-            if (left < j) qsort_descent_inplace(faceobjects, left, j);
-        }
-#pragma omp section
-        {
-            if (i < right) qsort_descent_inplace(faceobjects, i, right);
-        }
-    }
-}
-
-static void qsort_descent_inplace(std::vector<Object>& faceobjects)
-{
-    if (faceobjects.empty())
-        return;
-
-    qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
-}
-
-static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
-{
-    picked.clear();
-
-    const int n = faceobjects.size();
-
-    std::vector<float> areas(n);
-    for (int i = 0; i < n; i++)
-    {
-        areas[i] = faceobjects[i].rect.area();
-    }
-
-    for (int i = 0; i < n; i++)
-    {
-        const Object& a = faceobjects[i];
-
-        int keep = 1;
-        for (int j = 0; j < (int)picked.size(); j++)
-        {
-            const Object& b = faceobjects[picked[j]];
-
-            // intersection over union
-            float inter_area = intersection_area(a, b);
-            float union_area = areas[i] + areas[picked[j]] - inter_area;
-            // float IoU = inter_area / union_area
-            if (inter_area / union_area > nms_threshold)
-                keep = 0;
-        }
-
-        if (keep)
-            picked.push_back(i);
-    }
-}
-
-
-static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
-{
-    static const char* class_names[] = {
-            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
-            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-            "hair drier", "toothbrush"
-    };
-
-    cv::Mat image = bgr.clone();
-
-    for (size_t i = 0; i < objects.size(); i++)
-    {
-        const Object& obj = objects[i];
-
-        fprintf(stderr, "%2d: %3.0f%%, [%4.0f, %4.0f, %4.0f, %4.0f], %s\n", obj.label, obj.prob * 100, obj.rect.x,
-                obj.rect.y, obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]);
-
-        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
-
-        char text[256];
-        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
-
-        int baseLine = 0;
-        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
-
-        int x = obj.rect.x;
-        int y = obj.rect.y - label_size.height - baseLine;
-        if (y < 0)
-            y = 0;
-        if (x + label_size.width > image.cols)
-            x = image.cols - label_size.width;
-
-        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
-                      cv::Scalar(255, 255, 255), -1);
-
-        cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5,
-                    cv::Scalar(0, 0, 0));
-    }
-
-    cv::imwrite("yolox_timvx_out.jpg", image);
-}
-
-struct GridAndStride
-{
-    int grid0;
-    int grid1;
-    int stride;
-};
-
-static int generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
-{
-    for (auto stride : strides)
-    {
-        int num_grid = target_size / stride;
-        for (int g1 = 0; g1 < num_grid; g1++)
-        {
-            for (int g0 = 0; g0 < num_grid; g0++)
-            {
-                GridAndStride ss;
-                ss.grid0 = g0;
-                ss.grid1 = g1;
-                ss.stride = stride;
-                grid_strides.push_back(ss);
-            }
-        }
-    }
-
-    return 0;
-}
-
-static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, float* feat_ptr, float prob_threshold, std::vector<Object>& objects)
-{
-    const int num_grid = 3549;
-    const int num_class = 80;
-    const int num_anchors = grid_strides.size();
-
-    //const float* feat_ptr = feat_blob;
-    for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
-    {
-        // printf("%d,%d\n",num_anchors,anchor_idx);
-        const int grid0 = grid_strides[anchor_idx].grid0;
-        const int grid1 = grid_strides[anchor_idx].grid1;
-        const int stride = grid_strides[anchor_idx].stride;
-
-        // yolox/models/yolo_head.py decode logic
-        //  outputs[..., :2] = (outputs[..., :2] + grids) * strides
-        //  outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
-        float x_center = (feat_ptr[0] + grid0) * stride;
-        float y_center = (feat_ptr[1] + grid1) * stride;
-        float w = exp(feat_ptr[2]) * stride;
-        float h = exp(feat_ptr[3]) * stride;
-        float x0 = x_center - w * 0.5f;
-        float y0 = y_center - h * 0.5f;
-
-        float box_objectness = feat_ptr[4];
-
-        for (int class_idx = 0; class_idx < num_class; class_idx++)
-        {
-            float box_cls_score = feat_ptr[5 + class_idx];
-            float box_prob = box_objectness * box_cls_score;
-            if (box_prob > prob_threshold)
-            {
-                Object obj;
-                obj.rect.x = x0;
-                obj.rect.y = y0;
-                obj.rect.width = w;
-                obj.rect.height = h;
-                obj.label = class_idx;
-                obj.prob = box_prob;
-
-                objects.push_back(obj);
-            }
-
-        } // class loop
-        feat_ptr += 85;
-
-    } // point anchor loop
-}
-
-void show_usage()
-{
-    fprintf(
-            stderr,
-            "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
-}
-
-void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean,
-                                const float* scale, float input_scale, int zero_point)
-{
-    cv::Mat sample = cv::imread(image_file, 1);
-    cv::Mat img;
-
-    if (sample.channels() == 1)
-        cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB);
-    else
-        cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
-
-    /* letterbox process to support different letterbox size */
-    float scale_letterbox;
-    int resize_rows;
-    int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
-        scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
-        scale_letterbox = letterbox_cols * 1.0 / img.cols;
-    }
-    resize_cols = int(scale_letterbox * img.cols);
-    resize_rows = int(scale_letterbox * img.rows);
-
-    cv::resize(img, img, cv::Size(resize_cols, resize_rows));
-
-    img.convertTo(img, CV_32FC3);
-    // Generate a gray image for letterbox using opencv
-    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0)/*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/);
-    int top = 0;
-    int bot = letterbox_rows - resize_rows;
-    int left = 0;
-    int right = letterbox_cols - resize_cols;
-    // Letterbox filling
-    cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(114.f, 114.f, 114.f));
-
-    img_new.convertTo(img_new, CV_32FC3);
-    float* img_data   = (float* )img_new.data;
-    std::vector<float> input_temp(3 * letterbox_cols * letterbox_rows);
-
-    /* nhwc to nchw */
-    for (int h = 0; h < letterbox_rows; h++)
-    {
-        for (int w = 0; w < letterbox_cols; w++)
-        {
-            for (int c = 0; c < 3; c++)
-            {
-                int in_index  = h * letterbox_cols * 3 + w * 3 + c;
-                int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
-                input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
-            }
-        }
-    }
-
-    /* focus process */
-    for (int i = 0; i < 2; i++) // corresponding to rows
-    {
-        for (int g = 0; g < 2; g++) // corresponding to cols
-        {
-            for (int c = 0; c < 3; c++)
-            {
-                for (int h = 0; h < letterbox_rows/2; h++)
-                {
-                    for (int w = 0; w < letterbox_cols/2; w++)
-                    {
-                        int in_index  = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows +
-                                        h * 2 * letterbox_cols + w * 2;
-                        int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        g * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        c * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        h * (letterbox_cols/2) +
-                                        w;
-
-                        /* quant to uint8 */
-                        int udata = (round)(input_temp[in_index] / input_scale + ( float )zero_point);
-                        if (udata > 255)
-                            udata = 255;
-                        else if (udata < 0)
-                            udata = 0;
-
-                        input_data[out_index] = udata;
-                    }
-                }
-            }
-        }
-    }
-}
-
-
-int main(int argc, char* argv[])
-{
-    const char* model_file = nullptr;
-    const char* image_file = nullptr;
-
-    int img_c = 3;
-    const float mean[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f};
-    const float scale[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)};
-
-    // allow none square letterbox, set default letterbox size
-    int letterbox_rows = 640;
-    int letterbox_cols = 640;
-
-    int repeat_count = 1;
-    int num_thread = 1;
-
-    int res;
-    while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1)
-    {
-        switch (res)
-        {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
-        }
-    }
-
-    /* check files */
-    if (nullptr == model_file)
-    {
-        fprintf(stderr, "Error: Tengine model file not specified!\n");
-        show_usage();
-        return -1;
-    }
-
-    if (nullptr == image_file)
-    {
-        fprintf(stderr, "Error: Image file not specified!\n");
-        show_usage();
-        return -1;
-    }
-
-    if (!check_file_exist(model_file) || !check_file_exist(image_file))
-        return -1;
-
-    cv::Mat img = cv::imread(image_file, 1);
-    if (img.empty())
-    {
-        fprintf(stderr, "cv::imread %s failed\n", image_file);
-        return -1;
-    }
-
-    /* set runtime options */
-    struct options opt;
-    opt.num_thread = num_thread;
-    opt.cluster = TENGINE_CLUSTER_ALL;
-    opt.precision = TENGINE_MODE_UINT8;
-    opt.affinity = 0;
-
-    /* inital tengine */
-    if (init_tengine() != 0)
-    {
-        fprintf(stderr, "Initial tengine failed.\n");
-        return -1;
-    }
-    fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());
-
-    /* create VeriSilicon TIM-VX backend */
-    context_t timvx_context = create_context("timvx", 1);
-    int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0);
-    if (0 > rtt)
-    {
-        fprintf(stderr, " add_context_device VSI DEVICE failed.\n");
-        return -1;
-    }
-
-    /* create graph, load tengine model xxx.tmfile */
-    graph_t graph = create_graph(timvx_context, "tengine", model_file);
-    if (graph == nullptr)
-    {
-        fprintf(stderr, "Create graph failed.\n");
-        return -1;
-    }
-
-    int img_size = letterbox_rows * letterbox_cols * img_c;
-    int dims[] = {1, 12, int(letterbox_rows / 2), int(letterbox_cols / 2)};
-    std::vector<uint8_t> input_data(img_size);
-
-    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
-
-    if (input_tensor == nullptr)
-    {
-        fprintf(stderr, "Get input tensor failed\n");
-        return -1;
-    }
-
-    if (set_tensor_shape(input_tensor, dims, 4) < 0)
-    {
-        fprintf(stderr, "Set input tensor shape failed\n");
-        return -1;
-    }
-
-    if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0)
-    {
-        fprintf(stderr, "Set input tensor buffer failed\n");
-        return -1;
-    }
-
-    /* prerun graph, set work options(num_thread, cluster, precision) */
-    if (prerun_graph_multithread(graph, opt) < 0)
-    {
-        fprintf(stderr, "Prerun multithread graph failed.\n");
-        return -1;
-    }
-
-    /* prepare process input data, set the data mem to input tensor */
-    float input_scale = 0.f;
-    int input_zero_point = 0;
-    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    get_input_data_focus_uint8(image_file, input_data.data(), letterbox_rows, letterbox_cols, mean, scale, input_scale, input_zero_point);
-
-    /* run graph */
-    double min_time = DBL_MAX;
-    double max_time = DBL_MIN;
-    double total_time = 0.;
-    for (int i = 0; i < repeat_count; i++)
-    {
-        double start = get_current_time();
-        if (run_graph(graph, 1) < 0)
-        {
-            fprintf(stderr, "Run graph failed\n");
-            return -1;
-        }
-        double end = get_current_time();
-        double cur = end - start;
-        total_time += cur;
-        min_time = std::min(min_time, cur);
-        max_time = std::max(max_time, cur);
-    }
-    fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
-    fprintf(stderr, "--------------------------------------\n");
-
-    /* yolox postprocess */
-    tensor_t p8_output = get_graph_output_tensor(graph, 0, 0);
-    uint8_t * output_u8 = ( uint8_t*)get_tensor_buffer(p8_output);
-    int output_size = get_tensor_buffer_size(p8_output);
-
-    /* dequant */
-    float output_scale = 0.f;
-    int output_zero_point = 0;
-    get_tensor_quant_param(p8_output, &output_scale, &output_zero_point, 1);
-    std::vector<float> p8_data(output_size);
-    for (int i = 0; i < output_size; i++)
-        p8_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    /* postprocess */
-    const float prob_threshold = 0.3f;
-    const float nms_threshold = 0.65f;
-
-    std::vector<Object> proposals;
-    std::vector<Object> objects;
-
-    std::vector<int> strides = {8, 16, 32}; // might have stride=64
-    std::vector<GridAndStride> grid_strides;
-    generate_grids_and_stride(letterbox_rows, strides, grid_strides);
-    generate_yolox_proposals(grid_strides, p8_data.data(), prob_threshold, proposals);
-    qsort_descent_inplace(proposals);
-    std::vector<int> picked;
-    nms_sorted_bboxes(proposals, picked, nms_threshold);
-
-    /* yolox draw the result */
-    float scale_letterbox;
-    int resize_rows;
-    int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
-        scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
-        scale_letterbox = letterbox_cols * 1.0 / img.cols;
-    }
-
-    int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
-
-    objects.resize(count);
-    for (int i = 0; i < count; i++)
-    {
-        objects[i] = proposals[picked[i]];
-        float x0 = (objects[i].rect.x) / scale_letterbox;
-        float y0 = (objects[i].rect.y) / scale_letterbox;
-        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale_letterbox;
-        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale_letterbox;
-        x0 = std::max(std::min(x0, (float)(img.cols - 1)), 0.f);
-        y0 = std::max(std::min(y0, (float)(img.rows - 1)), 0.f);
-        x1 = std::max(std::min(x1, (float)(img.cols - 1)), 0.f);
-        y1 = std::max(std::min(y1, (float)(img.rows - 1)), 0.f);
-
-        objects[i].rect.x = x0;
-        objects[i].rect.y = y0;
-        objects[i].rect.width = x1 - x0;
-        objects[i].rect.height = y1 - y0;
-    }
-
-    draw_objects(img, objects);
-
-    /* release tengine */
-    postrun_graph(graph);
-    destroy_graph(graph);
-    release_tengine();
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: xwwang@openailab.com
+ * Author: 774074168@qq.com
+ * Author: honghao@openailab.com
+ * original model: https://github.com/Megvii-BaseDetection/YOLOX
+ */
+
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <cmath>
+#include <stdlib.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include "common.h"
+#include "tengine/c_api.h"
+#include "tengine_operations.h"
+
+struct Object
+{
+    cv::Rect_<float> rect;
+    int label;
+    float prob;
+};
+
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    cv::Rect_<float> inter = a.rect & b.rect;
+    return inter.area();
+}
+
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = faceobjects[(left + right) / 2].prob;
+
+    while (i <= j)
+    {
+        while (faceobjects[i].prob > p)
+            i++;
+
+        while (faceobjects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap
+            std::swap(faceobjects[i], faceobjects[j]);
+
+            i++;
+            j--;
+        }
+    }
+
+#pragma omp parallel sections
+    {
+#pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(faceobjects, left, j);
+        }
+#pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(faceobjects, i, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    if (faceobjects.empty())
+        return;
+
+    qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+                keep = 0;
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%2d: %3.0f%%, [%4.0f, %4.0f, %4.0f, %4.0f], %s\n", obj.label, obj.prob * 100, obj.rect.x,
+                obj.rect.y, obj.rect.x + obj.rect.width, obj.rect.y + obj.rect.height, class_names[obj.label]);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5,
+                    cv::Scalar(0, 0, 0));
+    }
+
+    cv::imwrite("yolox_timvx_out.jpg", image);
+}
+
+struct GridAndStride
+{
+    int grid0;
+    int grid1;
+    int stride;
+};
+
+static int generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
+{
+    for (auto stride : strides)
+    {
+        int num_grid = target_size / stride;
+        for (int g1 = 0; g1 < num_grid; g1++)
+        {
+            for (int g0 = 0; g0 < num_grid; g0++)
+            {
+                GridAndStride ss;
+                ss.grid0 = g0;
+                ss.grid1 = g1;
+                ss.stride = stride;
+                grid_strides.push_back(ss);
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, float* feat_ptr, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid = 3549;
+    const int num_class = 80;
+    const int num_anchors = grid_strides.size();
+
+    //const float* feat_ptr = feat_blob;
+    for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
+    {
+        // printf("%d,%d\n",num_anchors,anchor_idx);
+        const int grid0 = grid_strides[anchor_idx].grid0;
+        const int grid1 = grid_strides[anchor_idx].grid1;
+        const int stride = grid_strides[anchor_idx].stride;
+
+        // yolox/models/yolo_head.py decode logic
+        //  outputs[..., :2] = (outputs[..., :2] + grids) * strides
+        //  outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
+        float x_center = (feat_ptr[0] + grid0) * stride;
+        float y_center = (feat_ptr[1] + grid1) * stride;
+        float w = exp(feat_ptr[2]) * stride;
+        float h = exp(feat_ptr[3]) * stride;
+        float x0 = x_center - w * 0.5f;
+        float y0 = y_center - h * 0.5f;
+
+        float box_objectness = feat_ptr[4];
+
+        for (int class_idx = 0; class_idx < num_class; class_idx++)
+        {
+            float box_cls_score = feat_ptr[5 + class_idx];
+            float box_prob = box_objectness * box_cls_score;
+            if (box_prob > prob_threshold)
+            {
+                Object obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = w;
+                obj.rect.height = h;
+                obj.label = class_idx;
+                obj.prob = box_prob;
+
+                objects.push_back(obj);
+            }
+
+        } // class loop
+        feat_ptr += 85;
+
+    } // point anchor loop
+}
+
+void show_usage()
+{
+    fprintf(
+        stderr,
+        "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
+}
+
+void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean,
+                                const float* scale, float input_scale, int zero_point)
+{
+    cv::Mat sample = cv::imread(image_file, 1);
+    cv::Mat img;
+
+    if (sample.channels() == 1)
+        cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB);
+    else
+        cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
+
+    /* letterbox process to support different letterbox size */
+    float scale_letterbox;
+    int resize_rows;
+    int resize_cols;
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
+        scale_letterbox = letterbox_rows * 1.0 / img.rows;
+    }
+    else
+    {
+        scale_letterbox = letterbox_cols * 1.0 / img.cols;
+    }
+    resize_cols = int(scale_letterbox * img.cols);
+    resize_rows = int(scale_letterbox * img.rows);
+
+    cv::resize(img, img, cv::Size(resize_cols, resize_rows));
+
+    img.convertTo(img, CV_32FC3);
+    // Generate a gray image for letterbox using opencv
+    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0, 0, 0) /*cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2])*/);
+    int top = 0;
+    int bot = letterbox_rows - resize_rows;
+    int left = 0;
+    int right = letterbox_cols - resize_cols;
+    // Letterbox filling
+    cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(114.f, 114.f, 114.f));
+
+    img_new.convertTo(img_new, CV_32FC3);
+    float* img_data = (float*)img_new.data;
+    std::vector<float> input_temp(3 * letterbox_cols * letterbox_rows);
+
+    /* nhwc to nchw */
+    for (int h = 0; h < letterbox_rows; h++)
+    {
+        for (int w = 0; w < letterbox_cols; w++)
+        {
+            for (int c = 0; c < 3; c++)
+            {
+                int in_index = h * letterbox_cols * 3 + w * 3 + c;
+                int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
+                input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
+            }
+        }
+    }
+
+    /* focus process */
+    for (int i = 0; i < 2; i++) // corresponding to rows
+    {
+        for (int g = 0; g < 2; g++) // corresponding to cols
+        {
+            for (int c = 0; c < 3; c++)
+            {
+                for (int h = 0; h < letterbox_rows / 2; h++)
+                {
+                    for (int w = 0; w < letterbox_cols / 2; w++)
+                    {
+                        int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2;
+                        int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w;
+
+                        /* quant to uint8 */
+                        int udata = (round)(input_temp[in_index] / input_scale + (float)zero_point);
+                        if (udata > 255)
+                            udata = 255;
+                        else if (udata < 0)
+                            udata = 0;
+
+                        input_data[out_index] = udata;
+                    }
+                }
+            }
+        }
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    const char* model_file = nullptr;
+    const char* image_file = nullptr;
+
+    int img_c = 3;
+    const float mean[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f};
+    const float scale[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)};
+
+    // allow none square letterbox, set default letterbox size
+    int letterbox_rows = 640;
+    int letterbox_cols = 640;
+
+    int repeat_count = 1;
+    int num_thread = 1;
+
+    int res;
+    while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1)
+    {
+        switch (res)
+        {
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
+        }
+    }
+
+    /* check files */
+    if (nullptr == model_file)
+    {
+        fprintf(stderr, "Error: Tengine model file not specified!\n");
+        show_usage();
+        return -1;
+    }
+
+    if (nullptr == image_file)
+    {
+        fprintf(stderr, "Error: Image file not specified!\n");
+        show_usage();
+        return -1;
+    }
+
+    if (!check_file_exist(model_file) || !check_file_exist(image_file))
+        return -1;
+
+    cv::Mat img = cv::imread(image_file, 1);
+    if (img.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = num_thread;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_UINT8;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+    fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());
+
+    /* create VeriSilicon TIM-VX backend */
+    context_t timvx_context = create_context("timvx", 1);
+    int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0);
+    if (0 > rtt)
+    {
+        fprintf(stderr, " add_context_device VSI DEVICE failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(timvx_context, "tengine", model_file);
+    if (graph == nullptr)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    int img_size = letterbox_rows * letterbox_cols * img_c;
+    int dims[] = {1, 12, int(letterbox_rows / 2), int(letterbox_cols / 2)};
+    std::vector<uint8_t> input_data(img_size);
+
+    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
+
+    if (input_tensor == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor, input_data.data(), img_size) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    float input_scale = 0.f;
+    int input_zero_point = 0;
+    get_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    get_input_data_focus_uint8(image_file, input_data.data(), letterbox_rows, letterbox_cols, mean, scale, input_scale, input_zero_point);
+
+    /* run graph */
+    double min_time = DBL_MAX;
+    double max_time = DBL_MIN;
+    double total_time = 0.;
+    for (int i = 0; i < repeat_count; i++)
+    {
+        double start = get_current_time();
+        if (run_graph(graph, 1) < 0)
+        {
+            fprintf(stderr, "Run graph failed\n");
+            return -1;
+        }
+        double end = get_current_time();
+        double cur = end - start;
+        total_time += cur;
+        min_time = std::min(min_time, cur);
+        max_time = std::max(max_time, cur);
+    }
+    fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
+            total_time / repeat_count, max_time, min_time);
+    fprintf(stderr, "--------------------------------------\n");
+
+    /* yolox postprocess */
+    tensor_t p8_output = get_graph_output_tensor(graph, 0, 0);
+    uint8_t* output_u8 = (uint8_t*)get_tensor_buffer(p8_output);
+    int output_size = get_tensor_buffer_size(p8_output);
+
+    /* dequant */
+    float output_scale = 0.f;
+    int output_zero_point = 0;
+    get_tensor_quant_param(p8_output, &output_scale, &output_zero_point, 1);
+    std::vector<float> p8_data(output_size);
+    for (int i = 0; i < output_size; i++)
+        p8_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    /* postprocess */
+    const float prob_threshold = 0.3f;
+    const float nms_threshold = 0.65f;
+
+    std::vector<Object> proposals;
+    std::vector<Object> objects;
+
+    std::vector<int> strides = {8, 16, 32}; // might have stride=64
+    std::vector<GridAndStride> grid_strides;
+    generate_grids_and_stride(letterbox_rows, strides, grid_strides);
+    generate_yolox_proposals(grid_strides, p8_data.data(), prob_threshold, proposals);
+    qsort_descent_inplace(proposals);
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    /* yolox draw the result */
+    float scale_letterbox;
+    int resize_rows;
+    int resize_cols;
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
+        scale_letterbox = letterbox_rows * 1.0 / img.rows;
+    }
+    else
+    {
+        scale_letterbox = letterbox_cols * 1.0 / img.cols;
+    }
+
+    int count = picked.size();
+    fprintf(stderr, "detection num: %d\n", count);
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+        float x0 = (objects[i].rect.x) / scale_letterbox;
+        float y0 = (objects[i].rect.y) / scale_letterbox;
+        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale_letterbox;
+        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale_letterbox;
+        x0 = std::max(std::min(x0, (float)(img.cols - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img.rows - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img.cols - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img.rows - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    draw_objects(img, objects);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+}
diff --git a/source/api/c_api.c b/source/api/c_api.c
index e79d42f99..4bff60859 100644
--- a/source/api/c_api.c
+++ b/source/api/c_api.c
@@ -54,7 +54,7 @@
 #include <string.h>
 
 #define STR_VERSION2(a) #a
-#define STR_VERSION(a) STR_VERSION2(a)
+#define STR_VERSION(a)  STR_VERSION2(a)
 
 #ifdef TENGINE_LITE_VERSION
 static const char* tengine_lite_version = STR_VERSION(TENGINE_LITE_VERSION);
@@ -70,13 +70,10 @@ static const char* ver_postfix = "dev";
 
 static char* hcl_version = NULL;
 
-
 static int init_flag = 0;
 
-
 //////////////////////////////////////////////////// context about  ////////////////////////////////////////////////////
 
-
 context_t create_context(const char* context_name, int empty_context)
 {
     struct context* context = (struct context*)sys_malloc(sizeof(struct context));
@@ -92,7 +89,6 @@ context_t create_context(const char* context_name, int empty_context)
     return context;
 }
 
-
 void destroy_context(context_t context)
 {
     struct context* ctx = (struct context*)context;
@@ -120,13 +116,11 @@ void destroy_context(context_t context)
     sys_free(ctx);
 }
 
-
 struct context* get_ir_graph_context(struct graph* ir_graph)
 {
     return ir_graph->attribute->context;
 }
 
-
 int get_context_device_number(context_t context)
 {
     struct context* ctx = (struct context*)context;
@@ -138,7 +132,6 @@ int get_context_device_number(context_t context)
     return 0;
 }
 
-
 struct device* get_context_device(context_t context, int index)
 {
     struct context* ctx = (struct context*)context;
@@ -156,7 +149,6 @@ struct device* get_context_device(context_t context, int index)
     return NULL;
 }
 
-
 int add_context_device(context_t context, const char* dev_name)
 {
     struct context* ctx = (struct context*)context;
@@ -184,7 +176,6 @@ int add_context_device(context_t context, const char* dev_name)
     return 0;
 }
 
-
 int set_context_device(context_t context, const char* dev_name, const void* dev_option, size_t dev_opt_size)
 {
     struct context* ctx = (struct context*)context;
@@ -218,7 +209,6 @@ int set_context_device(context_t context, const char* dev_name, const void* dev_
     return 0;
 }
 
-
 int remove_context_device(context_t context, const char* dev_name)
 {
     struct context* ctx = (struct context*)context;
@@ -250,22 +240,18 @@ int remove_context_device(context_t context, const char* dev_name)
     return -1;
 }
 
-
 int set_context_attr(context_t context, const char* attr_name, const void* val, int val_size)
 {
     return -1;
 }
 
-
 int get_context_attr(context_t context, const char* attr_name, void* val, int val_size)
 {
     return -1;
 }
 
-
 ////////////////////////////////////////////////////  engine about  ////////////////////////////////////////////////////
 
-
 const char* get_tengine_version(void)
 {
     static char buf[128];
@@ -277,13 +263,11 @@ const char* get_tengine_version(void)
     return buf;
 }
 
-
 int request_tengine_version(const char* version)
 {
     return 1;
 }
 
-
 int init_tengine(void)
 {
     if (0 != init_flag)
@@ -319,7 +303,6 @@ int init_tengine(void)
     return ret;
 }
 
-
 void release_tengine(void)
 {
     if (0 == init_flag)
@@ -360,17 +343,14 @@ void release_tengine(void)
     init_flag = 0;
 }
 
-
 ////////////////////////////////////////////////////  graph about   ////////////////////////////////////////////////////
 
-
 graph_t create_graph_error(ir_graph_t* graph)
 {
     destroy_graph(graph);
     return NULL;
 }
 
-
 graph_t create_graph(context_t context, const char* model_format, const char* file_name, ...)
 {
     int is_new_context = 0;
@@ -447,19 +427,17 @@ graph_t create_graph(context_t context, const char* model_format, const char* fi
     return ir_graph;
 }
 
-
 int prerun_graph(graph_t graph)
 {
     struct options option;
-    option.num_thread   =  1;
-    option.precision    = -1;
-    option.affinity     = -1;
-    option.cluster      = TENGINE_CLUSTER_BIG;
+    option.num_thread = 1;
+    option.precision = -1;
+    option.affinity = -1;
+    option.cluster = TENGINE_CLUSTER_BIG;
 
     return prerun_graph_multithread(graph, option);
 }
 
-
 int prerun_graph_multithread(graph_t graph, struct options option)
 {
     struct graph* ir_graph = (struct graph*)graph;
@@ -519,9 +497,7 @@ int prerun_graph_multithread(graph_t graph, struct options option)
     }
 
     int precision = TENGINE_MODE_FP32;
-    if (0 <= option.precision && (TENGINE_MODE_FP32 == option.precision || TENGINE_MODE_FP16 == option.precision
-        || TENGINE_MODE_HYBRID_INT8== option.precision ||  TENGINE_MODE_UINT8 == option.precision
-        || TENGINE_MODE_INT8== option.precision))
+    if (0 <= option.precision && (TENGINE_MODE_FP32 == option.precision || TENGINE_MODE_FP16 == option.precision || TENGINE_MODE_HYBRID_INT8 == option.precision || TENGINE_MODE_UINT8 == option.precision || TENGINE_MODE_INT8 == option.precision))
     {
         precision = option.precision;
     }
@@ -529,11 +505,11 @@ int prerun_graph_multithread(graph_t graph, struct options option)
     ctx->default_options = sys_malloc(sizeof(struct cpu_option));
 
     struct cpu_option* opt = (struct cpu_option*)ctx->default_options;
-    opt->dev_name     = CPU_DEVICE_NAME;
-    opt->num_thread   = count;
-    opt->cluster      = TENGINE_CLUSTER_BIG;
-    opt->precision    = precision;
-    opt->affinity     = option.affinity;
+    opt->dev_name = CPU_DEVICE_NAME;
+    opt->num_thread = count;
+    opt->cluster = TENGINE_CLUSTER_BIG;
+    opt->precision = precision;
+    opt->affinity = option.affinity;
 
     struct scheduler* scheduler = ctx->scheduler;
     ret = scheduler->prerun(scheduler, ir_graph);
@@ -566,7 +542,6 @@ int prerun_graph_multithread(graph_t graph, struct options option)
     return 0;
 }
 
-
 int run_graph(graph_t graph, int block)
 {
     struct graph* ir_graph = (struct graph*)graph;
@@ -589,7 +564,6 @@ int run_graph(graph_t graph, int block)
     return 0;
 }
 
-
 int wait_graph(graph_t graph, int try_wait)
 {
     struct graph* ir_graph = (struct graph*)graph;
@@ -614,7 +588,6 @@ int wait_graph(graph_t graph, int try_wait)
     return scheduler->wait(scheduler, ir_graph);
 }
 
-
 int postrun_graph(graph_t graph)
 {
     struct graph* ir_graph = (struct graph*)graph;
@@ -637,7 +610,6 @@ int postrun_graph(graph_t graph)
     return 0;
 }
 
-
 int set_graph_layout(graph_t graph, int layout_type)
 {
     struct graph* ir_graph = (struct graph*)graph;
@@ -652,31 +624,26 @@ int set_graph_layout(graph_t graph, int layout_type)
     return 0;
 }
 
-
 int set_graph_attr(graph_t graph, const char* attr_name, const void* buf, int size)
 {
     return -1;
 }
 
-
 int get_graph_attr(graph_t graph, const char* attr_name, void* buf, int size)
 {
     return -1;
 }
 
-
 int set_graph_thread(graph_t graph, int cluster, int threads)
 {
     return -1;
 }
 
-
 int set_graph_thread_mask(graph_t graph, size_t cpu_mask)
 {
     return -1;
 }
 
-
 int destroy_graph(graph_t graph)
 {
     struct graph* ir_graph = (struct graph*)graph;
@@ -689,13 +656,11 @@ int destroy_graph(graph_t graph)
     return 0;
 }
 
-
 void dump_graph(graph_t graph)
 {
     dump_ir_graph((ir_graph_t*)graph);
 }
 
-
 int set_graph_device(graph_t graph, const char* dev_name)
 {
     struct graph* ir_graph = (struct graph*)graph;
@@ -710,16 +675,14 @@ int set_graph_device(graph_t graph, const char* dev_name)
     return 0;
 }
 
-
 ////////////////////////////////////////////////////   node about   ////////////////////////////////////////////////////
 
-
 int set_graph_input_node(graph_t graph, const char* input_nodes[], int input_number)
 {
     struct graph* ir_graph = (struct graph*)graph;
     int16_t* input_node_indexes;
 
-    input_node_indexes = ( int16_t* )sys_malloc(sizeof(int16_t) * input_number);
+    input_node_indexes = (int16_t*)sys_malloc(sizeof(int16_t) * input_number);
 
     if (input_node_indexes == NULL)
     {
@@ -746,14 +709,13 @@ int set_graph_input_node(graph_t graph, const char* input_nodes[], int input_num
     return ret;
 }
 
-
 int set_graph_output_node(graph_t graph, const char* output_nodes[], int output_number)
 {
     struct graph* ir_graph = (struct graph*)graph;
 
     int16_t* output_node_indexes;
 
-    output_node_indexes = ( int16_t* )sys_malloc(sizeof(int16_t) * output_number);
+    output_node_indexes = (int16_t*)sys_malloc(sizeof(int16_t) * output_number);
 
     if (output_node_indexes == NULL)
     {
@@ -780,18 +742,16 @@ int set_graph_output_node(graph_t graph, const char* output_nodes[], int output_
     return ret;
 }
 
-
 int get_graph_input_node_number(graph_t graph)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     return ir_graph->input_num;
 }
 
-
 node_t get_graph_input_node(graph_t graph, int idx)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     if (idx < 0 || idx >= ir_graph->input_num)
     {
@@ -801,18 +761,16 @@ node_t get_graph_input_node(graph_t graph, int idx)
     return get_ir_graph_node(ir_graph, ir_graph->input_nodes[idx]);
 }
 
-
 int get_graph_output_node_number(graph_t graph)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     return ir_graph->output_num;
 }
 
-
 node_t get_graph_output_node(graph_t graph, int idx)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     if (idx < 0 || idx >= ir_graph->output_num)
     {
@@ -822,10 +780,9 @@ node_t get_graph_output_node(graph_t graph, int idx)
     return get_ir_graph_node(ir_graph, ir_graph->output_nodes[idx]);
 }
 
-
 tensor_t get_graph_input_tensor(graph_t graph, int input_idx, int tensor_idx)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     if (input_idx < 0 || input_idx >= ir_graph->input_num)
     {
@@ -844,10 +801,9 @@ tensor_t get_graph_input_tensor(graph_t graph, int input_idx, int tensor_idx)
     return get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[tensor_idx]);
 }
 
-
 tensor_t get_graph_output_tensor(graph_t graph, int output_idx, int tensor_idx)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     if (output_idx < 0 || output_idx >= ir_graph->output_num)
     {
@@ -866,10 +822,9 @@ tensor_t get_graph_output_tensor(graph_t graph, int output_idx, int tensor_idx)
     return get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[tensor_idx]);
 }
 
-
 node_t create_graph_node(graph_t graph, const char* node_name, const char* op_name)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     int node_idx = get_ir_node_index_from_name(ir_graph, node_name);
 
@@ -888,10 +843,9 @@ node_t create_graph_node(graph_t graph, const char* node_name, const char* op_na
     return create_ir_node(ir_graph, node_name, op_type, 1);
 }
 
-
 node_t get_graph_node(graph_t graph, const char* node_name)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     int node_idx = get_ir_node_index_from_name(ir_graph, node_name);
 
@@ -903,10 +857,9 @@ node_t get_graph_node(graph_t graph, const char* node_name)
     return ir_graph->node_list[node_idx];
 }
 
-
 node_t get_graph_node_by_idx(graph_t graph, int idx)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     if (idx < 0 || idx >= ir_graph->node_num)
         return NULL;
@@ -914,34 +867,30 @@ node_t get_graph_node_by_idx(graph_t graph, int idx)
     return ir_graph->node_list[idx];
 }
 
-
 int get_graph_node_num(graph_t graph)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     return ir_graph->node_num;
 }
 
-
 int get_node_output_number(node_t node)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
 
     return ir_node->output_num;
 }
 
-
 int get_node_input_number(node_t node)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
 
     return ir_node->input_num;
 }
 
-
 const char* get_node_name(node_t node)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
 
     if (ir_node->name)
     {
@@ -953,17 +902,15 @@ const char* get_node_name(node_t node)
     return ir_node->name;
 }
 
-
 const char* get_node_op(node_t node)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
 
     int op_type = ir_node->op.type;
 
     return get_op_name_from_type(op_type);
 }
 
-
 const char* get_node_device(node_t node)
 {
     struct node* ir_node = (struct node*)node;
@@ -989,74 +936,62 @@ const char* get_node_device(node_t node)
     return NULL;
 }
 
-
 int get_node_attr_int(node_t node, const char* attr_name, int* attr_val)
 {
     return -1;
 }
 
-
 int get_node_attr_float(node_t node, const char* attr_name, float* attr_val)
 {
     return -1;
 }
 
-
 int get_node_attr_pointer(node_t node, const char* attr_name, void* attr_val)
 {
     return -1;
 }
 
-
 int get_node_attr_generic(node_t node, const char* attr_name, const char* type_name, void* buf, int size)
 {
     return -1;
 }
 
-
 int set_node_attr_int(node_t node, const char* attr_name, const int* attr_val)
 {
     return -1;
 }
 
-
 int set_node_attr_float(node_t node, const char* attr_name, const float* attr_val)
 {
     return -1;
 }
 
-
 int set_node_attr_pointer(node_t node, const char* attr_name, const void* attr_val)
 {
     return -1;
 }
 
-
 int set_node_attr_generic(node_t node, const char* attr_name, const char* type_name, const void* buf, int size)
 {
     return -1;
 }
 
-
 int add_node_attr(node_t node, const char* attr_name, const char* type_name, int size)
 {
     return -1;
 }
 
-
 void release_graph_node(node_t node)
 {
-    ( void )node;
+    (void)node;
     // NOTHING NEEDS TO DO
 }
 
-
 ////////////////////////////////////////////////////  tensor about  ////////////////////////////////////////////////////
 
-
 tensor_t get_node_input_tensor(node_t node, int input_idx)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
 
     if (input_idx < 0 || input_idx >= ir_node->input_num)
     {
@@ -1068,7 +1003,7 @@ tensor_t get_node_input_tensor(node_t node, int input_idx)
 
 tensor_t get_node_output_tensor(node_t node, int output_idx)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
 
     if (output_idx < 0 || output_idx >= ir_node->output_num)
     {
@@ -1078,19 +1013,17 @@ tensor_t get_node_output_tensor(node_t node, int output_idx)
     return get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[output_idx]);
 }
 
-
 int set_node_input_tensor(node_t node, int input_idx, tensor_t tensor)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
     struct tensor* ir_tensor = (struct tensor*)tensor;
 
     return set_ir_node_input_tensor(ir_node, input_idx, ir_tensor);
 }
 
-
 int set_node_output_tensor(node_t node, int output_idx, tensor_t tensor, int tensor_type)
 {
-    struct node* ir_node = ( struct node* )node;
+    struct node* ir_node = (struct node*)node;
     struct tensor* ir_tensor = (struct tensor*)tensor;
 
     ir_tensor->tensor_type = tensor_type;
@@ -1098,18 +1031,16 @@ int set_node_output_tensor(node_t node, int output_idx, tensor_t tensor, int ten
     return set_ir_node_output_tensor(ir_node, output_idx, ir_tensor);
 }
 
-
 tensor_t create_graph_tensor(graph_t graph, const char* tensor_name, int data_type)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     return create_ir_tensor(ir_graph, tensor_name, data_type);
 }
 
-
 tensor_t get_graph_tensor(graph_t graph, const char* tensor_name)
 {
-    struct graph* ir_graph = ( struct graph* )graph;
+    struct graph* ir_graph = (struct graph*)graph;
 
     for (int i = 0; i < ir_graph->node_num; i++)
     {
@@ -1124,14 +1055,14 @@ tensor_t get_graph_tensor(graph_t graph, const char* tensor_name)
             {
                 struct tensor* ir_tensor = get_ir_graph_tensor(ir_node->graph, ir_node->input_tensors[j]);
                 if (ir_tensor && ir_tensor->name && !strcmp(ir_tensor->name, tensor_name))
-                    return ( tensor_t )ir_tensor;
+                    return (tensor_t)ir_tensor;
             }
 
             for (int j = 0; j < ir_node->output_num; j++)
             {
                 struct tensor* ir_tensor = get_ir_graph_tensor(ir_node->graph, ir_node->output_tensors[j]);
                 if (ir_tensor && ir_tensor->name && !strcmp(ir_tensor->name, tensor_name))
-                    return ( tensor_t )ir_tensor;
+                    return (tensor_t)ir_tensor;
             }
         }
     }
@@ -1139,7 +1070,6 @@ tensor_t get_graph_tensor(graph_t graph, const char* tensor_name)
     return NULL;
 }
 
-
 const char* get_tensor_name(tensor_t tensor)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1150,13 +1080,11 @@ const char* get_tensor_name(tensor_t tensor)
     return ir_tensor->name;
 }
 
-
 void release_graph_tensor(tensor_t tensor)
 {
     // NOTHING NEEDS TO DO
 }
 
-
 int set_tensor_shape(tensor_t tensor, const int dims[], int dim_number)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1164,7 +1092,6 @@ int set_tensor_shape(tensor_t tensor, const int dims[], int dim_number)
     return set_ir_tensor_shape(ir_tensor, dims, dim_number);
 }
 
-
 int get_tensor_shape(tensor_t tensor, int dims[], int dim_number)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1180,7 +1107,6 @@ int get_tensor_shape(tensor_t tensor, int dims[], int dim_number)
     return ir_tensor->dim_num;
 }
 
-
 int get_tensor_buffer_size(tensor_t tensor)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1188,7 +1114,6 @@ int get_tensor_buffer_size(tensor_t tensor)
     return (int)(ir_tensor->elem_size * ir_tensor->elem_num);
 }
 
-
 void* get_tensor_buffer(tensor_t tensor)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1198,7 +1123,6 @@ void* get_tensor_buffer(tensor_t tensor)
     return ir_tensor->data;
 }
 
-
 int set_tensor_buffer(tensor_t tensor, void* buffer, int buffer_size)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1220,7 +1144,6 @@ int set_tensor_buffer(tensor_t tensor, void* buffer, int buffer_size)
     return 0;
 }
 
-
 int get_tensor_data(tensor_t tensor, void* output_data, int data_size)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1247,7 +1170,6 @@ int get_tensor_data(tensor_t tensor, void* output_data, int data_size)
     return -1;
 }
 
-
 int set_tensor_data(tensor_t tensor, const void* input_data, int data_size)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1268,7 +1190,6 @@ int set_tensor_data(tensor_t tensor, const void* input_data, int data_size)
     return -1;
 }
 
-
 int get_tensor_data_type(tensor_t tensor)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1276,7 +1197,6 @@ int get_tensor_data_type(tensor_t tensor)
     return ir_tensor->data_type;
 }
 
-
 int set_tensor_data_type(tensor_t tensor, int data_type)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1286,7 +1206,6 @@ int set_tensor_data_type(tensor_t tensor, int data_type)
     return 0;
 }
 
-
 int get_tensor_layout(tensor_t tensor)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1294,7 +1213,6 @@ int get_tensor_layout(tensor_t tensor)
     return ir_tensor->layout;
 }
 
-
 int set_tensor_layout(tensor_t tensor, int layout)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1304,7 +1222,6 @@ int set_tensor_layout(tensor_t tensor, int layout)
     return 0;
 }
 
-
 int set_tensor_quant_param(tensor_t tensor, const float* scale, const int* zero_point, int number)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1312,7 +1229,6 @@ int set_tensor_quant_param(tensor_t tensor, const float* scale, const int* zero_
     return set_ir_tensor_quantization_parameter(ir_tensor, scale, zero_point, number);
 }
 
-
 int get_tensor_quant_param(tensor_t tensor, float* scale, int* zero_point, int number)
 {
     struct tensor* ir_tensor = (struct tensor*)tensor;
@@ -1320,63 +1236,52 @@ int get_tensor_quant_param(tensor_t tensor, float* scale, int* zero_point, int n
     return get_ir_tensor_quantization_parameter(ir_tensor, scale, zero_point, number);
 }
 
-
 ////////////////////////////////////////////////////   misc about   ////////////////////////////////////////////////////
 
-
 const char* get_tengine_hcl_version()
 {
     return hcl_version;
 }
 
-
 int set_default_device(const char* device)
 {
     return -1;
 }
 
-
 void set_log_level(enum log_level level)
 {
     SET_LOG_LEVEL(level);
 }
 
-
 void set_log_output(log_print_t func)
 {
     SET_LOG_OUTPUT(func);
 }
 
-
 int get_tengine_errno(void)
 {
     return -1;
 }
 
-
 int clr_tengine_errno(void)
 {
     return -1;
 }
 
-
 size_t get_cluster_affinity_mask(int cluster)
 {
     check_cpu();
     return get_cpu_cluster_mask(cluster);
 }
 
-
 ////////////////////////////////////////////////////  custom about  ////////////////////////////////////////////////////
 
-
 int set_custom_kernel(node_t node, const char* dev_name, struct custom_kernel_ops* kernel_ops)
 {
     // TODO: set custom kernel
     return -1;
 }
 
-
 int remove_custom_kernel(node_t node, const char* dev_name)
 {
     // TODO: remove custom kernel
diff --git a/source/api/c_api.h b/source/api/c_api.h
index c15093191..4e67ef6a1 100644
--- a/source/api/c_api.h
+++ b/source/api/c_api.h
@@ -28,7 +28,6 @@
 #include <stdint.h>
 #include <stddef.h>
 
-
 #if defined __GNUC__
 #define DLLEXPORT __attribute((visibility("default")))
 #elif defined(_MSC_VER)
@@ -37,10 +36,9 @@
 #define DLLEXPORT
 #endif
 
-
 #if defined __GNUC__
 #define DEPRECATED_BEFORE
-#define DEPRECATED_AFTER  __attribute__ ((deprecated))
+#define DEPRECATED_AFTER __attribute__((deprecated))
 #elif defined(_MSC_VER)
 #pragma deprecated()
 #define DEPRECATED_BEFORE __declspec(deprecated)
@@ -50,59 +48,57 @@
 #define DEPRECATED_AFTER
 #endif
 
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define MAX_SHAPE_DIM_NUM           8
+#define MAX_SHAPE_DIM_NUM 8
 
 /* the data type of the tensor */
-#define TENGINE_DT_FP32             0
-#define TENGINE_DT_FP16             1
-#define TENGINE_DT_INT8             2
-#define TENGINE_DT_UINT8            3
-#define TENGINE_DT_INT32            4
-#define TENGINE_DT_INT16            5
+#define TENGINE_DT_FP32  0
+#define TENGINE_DT_FP16  1
+#define TENGINE_DT_INT8  2
+#define TENGINE_DT_UINT8 3
+#define TENGINE_DT_INT32 4
+#define TENGINE_DT_INT16 5
 
 /* layout type, not real layout */
-#define TENGINE_LAYOUT_NCHW         0
-#define TENGINE_LAYOUT_NHWC         1
+#define TENGINE_LAYOUT_NCHW 0
+#define TENGINE_LAYOUT_NHWC 1
 
 /* tensor type: the content changed or not during inference */
-#define TENSOR_TYPE_UNKNOWN         0
-#define TENSOR_TYPE_VAR             1
-#define TENSOR_TYPE_CONST           2
-#define TENSOR_TYPE_INPUT           3
-#define TENSOR_TYPE_DEP             4
+#define TENSOR_TYPE_UNKNOWN 0
+#define TENSOR_TYPE_VAR     1
+#define TENSOR_TYPE_CONST   2
+#define TENSOR_TYPE_INPUT   3
+#define TENSOR_TYPE_DEP     4
 
 /* cluster type: big-LITTLE and DynamIQ defined */
-#define TENGINE_CLUSTER_ALL         0
-#define TENGINE_CLUSTER_BIG         1
-#define TENGINE_CLUSTER_MEDIUM      2
-#define TENGINE_CLUSTER_LITTLE      3
+#define TENGINE_CLUSTER_ALL    0
+#define TENGINE_CLUSTER_BIG    1
+#define TENGINE_CLUSTER_MEDIUM 2
+#define TENGINE_CLUSTER_LITTLE 3
 
-#define TENGINE_MODE_FP32           0
-#define TENGINE_MODE_FP16           1
-#define TENGINE_MODE_HYBRID_INT8    2
-#define TENGINE_MODE_UINT8          3
-#define TENGINE_MODE_INT8           4
+#define TENGINE_MODE_FP32        0
+#define TENGINE_MODE_FP16        1
+#define TENGINE_MODE_HYBRID_INT8 2
+#define TENGINE_MODE_UINT8       3
+#define TENGINE_MODE_INT8        4
 
 /* node dump action definition */
-#define NODE_DUMP_ACTION_DISABLE    0
-#define NODE_DUMP_ACTION_ENABLE     1
-#define NODE_DUMP_ACTION_START      2
-#define NODE_DUMP_ACTION_STOP       3
-#define NODE_DUMP_ACTION_GET        4
+#define NODE_DUMP_ACTION_DISABLE 0
+#define NODE_DUMP_ACTION_ENABLE  1
+#define NODE_DUMP_ACTION_START   2
+#define NODE_DUMP_ACTION_STOP    3
+#define NODE_DUMP_ACTION_GET     4
 
 /* graph perf action definition */
-#define GRAPH_PERF_STAT_DISABLE     0
-#define GRAPH_PERF_STAT_ENABLE      1
-#define GRAPH_PERF_STAT_STOP        2
-#define GRAPH_PERF_STAT_START       3
-#define GRAPH_PERF_STAT_RESET       4
-#define GRAPH_PERF_STAT_GET         5
-
+#define GRAPH_PERF_STAT_DISABLE 0
+#define GRAPH_PERF_STAT_ENABLE  1
+#define GRAPH_PERF_STAT_STOP    2
+#define GRAPH_PERF_STAT_START   3
+#define GRAPH_PERF_STAT_RESET   4
+#define GRAPH_PERF_STAT_GET     5
 
 /* follow the std. UNIX log level definition */
 enum log_level
@@ -117,7 +113,6 @@ enum log_level
     LOG_DEBUG
 };
 
-
 /* note: Android NN only define one event */
 enum graph_exec_event
 {
@@ -128,7 +123,6 @@ enum graph_exec_event
     GRAPH_EXEC_DONE
 };
 
-
 /* TODO: should add suspend? */
 enum graph_exec_stat
 {
@@ -139,7 +133,6 @@ enum graph_exec_stat
     GRAPH_STAT_ERROR
 };
 
-
 enum device_policy
 {
     DEFAULT_POLICY,
@@ -147,19 +140,15 @@ enum device_policy
     LOW_POWER_POLICY
 };
 
-
 typedef void* context_t;
 typedef void* graph_t;
 typedef void* tensor_t;
 typedef void* node_t;
 
-
 typedef int (*event_handler_t)(graph_t, int, void* arg);
 
-
 typedef void (*log_print_t)(const char*);
 
-
 /* graph exec options */
 typedef struct options
 {
@@ -169,15 +158,14 @@ typedef struct options
     uint64_t affinity;
 } options_t;
 
-
 struct custom_kernel_tensor
 {
     int dim[MAX_SHAPE_DIM_NUM]; /* the shape dim array */
-    int dim_num; /* valid entry number */
+    int dim_num;                /* valid entry number */
     int element_num;
     int element_size; /* determined  by data_type */
     int data_type;
-    int dev_type; /* indicate the tensor belongs to CPU/GPU ... */
+    int dev_type;    /* indicate the tensor belongs to CPU/GPU ... */
     int layout_type; /*  NCHW type or NHWC type*/
 
     /* quant info */
@@ -186,20 +174,19 @@ struct custom_kernel_tensor
     int* zero_point;
     int* quant_number;
 
-    void* data; /* pointer to host memory (virtual address) */
-    void* dev_mem; /* refers to device memory block */
+    void* data;       /* pointer to host memory (virtual address) */
+    void* dev_mem;    /* refers to device memory block */
     void* mapped_mem; /* the mapped address for device memory block */
 };
 
-
 /* For user to add user defined kernel*/
 struct custom_kernel_ops
 {
     const char* kernel_name; /* name of the kernel */
-    const char* op; /* name of the op to be implemented */
-    int force; /* if not set, when bind() failed,
+    const char* op;          /* name of the op to be implemented */
+    int force;               /* if not set, when bind() failed,
       try to use other kernel implementations*/
-    void* kernel_param; /* used for kernel impl functions */
+    void* kernel_param;      /* used for kernel impl functions */
     int kernel_param_size;
 
     /*!
@@ -230,7 +217,7 @@ struct custom_kernel_ops
      * @return the inplace input tensor index for an output tensor.
      *         if the output tensor is not an inplace one, return -1.
      */
-    int (*inplace_info)(struct custom_kernel_ops* ops, int output_idx);    // optional
+    int (*inplace_info)(struct custom_kernel_ops* ops, int output_idx); // optional
 
     /*!
      * @brief Check if the kernel can work on the input and output shapes.
@@ -321,7 +308,6 @@ struct custom_kernel_ops
     void (*release)(struct custom_kernel_ops* ops);
 };
 
-
 /************** Library intialization and version checking *******************/
 
 /*!
@@ -1122,7 +1108,6 @@ DLLEXPORT DEPRECATED_BEFORE const char* get_node_device(node_t node) DEPRECATED_
  */
 DLLEXPORT const char* get_default_device(void);
 
-
 /******************** execution context *****************************/
 
 /*!
diff --git a/source/api/plugin.c b/source/api/plugin.c
index c39c4bfae..019ceb1c0 100644
--- a/source/api/plugin.c
+++ b/source/api/plugin.c
@@ -39,13 +39,12 @@
 #endif
 
 #ifdef _MSC_VER
-typedef int(*fun_ptr)(void);
+typedef int (*fun_ptr)(void);
 typedef HINSTANCE so_handle_t;
 #else
 typedef void* so_handle_t;
 #endif
 
-
 struct plugin_header
 {
     char* name;
@@ -55,7 +54,6 @@ struct plugin_header
 
 static struct vector* plugin_list = NULL;
 
-
 static int exec_so_func(so_handle_t handle, const char* func_name)
 {
 #ifdef _MSC_VER
@@ -87,7 +85,6 @@ static int exec_so_func(so_handle_t handle, const char* func_name)
     return 0;
 }
 
-
 int load_tengine_plugin(const char* plugin_name, const char* file_name, const char* init_func_name)
 {
     struct plugin_header header;
@@ -138,7 +135,6 @@ int load_tengine_plugin(const char* plugin_name, const char* file_name, const ch
     /* execute the init function */
     if (init_func_name && exec_so_func(header.handle, init_func_name) < 0)
     {
-
 #ifdef _MSC_VER
         FreeLibrary(header.handle);
 #else
@@ -162,7 +158,6 @@ int load_tengine_plugin(const char* plugin_name, const char* file_name, const ch
     return 0;
 }
 
-
 int unload_tengine_plugin(const char* plugin_name, const char* rel_func_name)
 {
     if (plugin_list == NULL)
@@ -206,7 +201,6 @@ int unload_tengine_plugin(const char* plugin_name, const char* rel_func_name)
     return 0;
 }
 
-
 int get_tengine_plugin_number(void)
 {
     int plugin_num = 0;
@@ -217,7 +211,6 @@ int get_tengine_plugin_number(void)
     return plugin_num;
 }
 
-
 const char* get_tengine_plugin_name(int idx)
 {
     int plugin_num = get_tengine_plugin_number();
diff --git a/source/device/acl/acl_define.h b/source/device/acl/acl_define.h
index 60f70e2e9..184dc0f68 100644
--- a/source/device/acl/acl_define.h
+++ b/source/device/acl/acl_define.h
@@ -26,9 +26,8 @@
 
 #define ACL_DEV_NAME "ACL"
 
-
 typedef struct acl_option
 {
     char* dev_name;
-    int precision;      //!< precision of calculation
+    int precision; //!< precision of calculation
 } acl_opt_t;
diff --git a/source/device/acl/acl_device.hpp b/source/device/acl/acl_device.hpp
index 247b36db1..d8900de8e 100644
--- a/source/device/acl/acl_device.hpp
+++ b/source/device/acl/acl_device.hpp
@@ -26,8 +26,7 @@
 
 #include "acl_define.h"
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 
diff --git a/source/device/acl/acl_executor.hpp b/source/device/acl/acl_executor.hpp
index ea5f504f8..0574b9e38 100644
--- a/source/device/acl/acl_executor.hpp
+++ b/source/device/acl/acl_executor.hpp
@@ -38,8 +38,7 @@
 
 #include <arm_neon.h>
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 #include "graph/tensor.h"
@@ -53,7 +52,6 @@ extern "C"
 #include "utility/log.h"
 }
 
-
 #define MAX_TENGINE_DATA_TYPE_NUM 6
 static const int gs32TengineDataElemetSize[MAX_TENGINE_DATA_TYPE_NUM] = {4, 2, 1, 1, 4, 2};
 
@@ -65,7 +63,7 @@ using namespace arm_compute;
 #define dynamic_cast static_cast
 #endif
 
-template <typename T>
+template<typename T>
 inline void _PermuteDataLayoutNCHWToNHWCInter(T* pvData, int n, int c, int h, int w, T* pvOutputData);
 void _PermuteDataLayoutNCHWToNHWC(void* pvData, int n, int c, int h, int w, void* pvOutputData, int DataEleSize);
 void copy_buffer(void* dest, const void* src, const int src_len, DataType dest_type, DataType src_type);
@@ -77,10 +75,9 @@ class CLGraph
     ~CLGraph();
 
     void init(std::string name, DataType type);
-    int prerun(struct subgraph *subgraph, struct acl_option* option);
-    int run(struct subgraph *subgraph);
-    int postrun(struct subgraph *subgraph);
-
+    int prerun(struct subgraph* subgraph, struct acl_option* option);
+    int run(struct subgraph* subgraph);
+    int postrun(struct subgraph* subgraph);
 
 private:
     bool CreateACLGraph(struct subgraph* subgraph, DataType type, bool bDataLayoutOpFlag = false);
@@ -106,7 +103,7 @@ class CLGraph
 
 public:
     std::string name_;
-    std::vector<std::shared_ptr<IFunction>> functions_map_;
+    std::vector<std::shared_ptr<IFunction> > functions_map_;
     std::unordered_map<std::string, CLTensor*> tensors_map_;
     DataType data_type_;
 
diff --git a/source/device/acl/acl_graph.hpp b/source/device/acl/acl_graph.hpp
index dc070509e..be40c0fa6 100644
--- a/source/device/acl/acl_graph.hpp
+++ b/source/device/acl/acl_graph.hpp
@@ -24,15 +24,13 @@
 
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
 #include "graph/subgraph.h"
 #include "device/device.h"
 
-
 int acl_dev_init(struct device* dev);
 int acl_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options);
 int acl_dev_run(struct device* dev, struct subgraph* subgraph);
diff --git a/source/device/acl/acl_limit.hpp b/source/device/acl/acl_limit.hpp
index c09b1f6b0..0b08d3268 100644
--- a/source/device/acl/acl_limit.hpp
+++ b/source/device/acl/acl_limit.hpp
@@ -22,35 +22,33 @@
  * Author: hhchen@openailab.com
  */
 
-
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "operator/op.h"
 }
 
 const int acl_supported_ops[] = {
-        OP_BATCHNORM,
-        OP_CAST,
-        OP_CLIP,
-        OP_CONCAT,
-        OP_CONST,
-        OP_CONV,
-        OP_CROP,
-        OP_DECONV,
-        OP_DROPOUT,
-        OP_ELTWISE,
-        OP_FC,
-        //OP_FLATTEN,
-        OP_INPUT,
-        OP_INTERP,
-        //OP_PERMUTE,
-        OP_POOL,
-        OP_RELU,
-        OP_RESHAPE,
-        OP_RESIZE,
-        //OP_SLICE,
-        OP_SOFTMAX
-        //OP_BIAS,
+    OP_BATCHNORM,
+    OP_CAST,
+    OP_CLIP,
+    OP_CONCAT,
+    OP_CONST,
+    OP_CONV,
+    OP_CROP,
+    OP_DECONV,
+    OP_DROPOUT,
+    OP_ELTWISE,
+    OP_FC,
+    //OP_FLATTEN,
+    OP_INPUT,
+    OP_INTERP,
+    //OP_PERMUTE,
+    OP_POOL,
+    OP_RELU,
+    OP_RESHAPE,
+    OP_RESIZE,
+    //OP_SLICE,
+    OP_SOFTMAX
+    //OP_BIAS,
 };
diff --git a/source/device/cpu/cpu_define.h b/source/device/cpu/cpu_define.h
index dfc4ac09f..39ea017fe 100644
--- a/source/device/cpu/cpu_define.h
+++ b/source/device/cpu/cpu_define.h
@@ -26,29 +26,28 @@
 
 #include <stddef.h>
 
-#define OPS_SCORE_STATIC            10000
-#define OPS_SCORE_BEST              8000
-#define OPS_SCORE_PREFER            6000
-#define OPS_SCORE_CANDO             4000
-#define OPS_SCORE_NOTSUP            2000
+#define OPS_SCORE_STATIC 10000
+#define OPS_SCORE_BEST   8000
+#define OPS_SCORE_PREFER 6000
+#define OPS_SCORE_CANDO  4000
+#define OPS_SCORE_NOTSUP 2000
 
-#define MEM_POOL_ALLOCATED          8
-#define INPLACE_BLOCK_FLAG          0x40
+#define MEM_POOL_ALLOCATED 8
+#define INPLACE_BLOCK_FLAG 0x40
 
-#define CPU_DEVICE_NAME             "CPU"
-
-#define TENGINE_DUMP_DIR            "TG_DEBUG_DUMP_DIR"
-#define TENGINE_DUMP_LAYER          "TG_DEBUG_DATA"
-#define TENGINE_DUMP_GRAPH          "TG_DEBUG_GRAPH"
-#define TENGINE_PRINT_LAYER_COST    "TG_DEBUG_TIME"
-#define TENGINE_FORCE_USE_REF_OP    "TG_DEBUG_REF"
+#define CPU_DEVICE_NAME "CPU"
 
+#define TENGINE_DUMP_DIR         "TG_DEBUG_DUMP_DIR"
+#define TENGINE_DUMP_LAYER       "TG_DEBUG_DATA"
+#define TENGINE_DUMP_GRAPH       "TG_DEBUG_GRAPH"
+#define TENGINE_PRINT_LAYER_COST "TG_DEBUG_TIME"
+#define TENGINE_FORCE_USE_REF_OP "TG_DEBUG_REF"
 
 typedef struct cpu_option
 {
     const char* dev_name;
-    int         num_thread;     //!< how many threads to run
-    int         cluster;        //!< cpu cluster
-    int         precision;      //!< precision of calculation
-    size_t      affinity;       //!< affinity of cpu core, max 64 cpus
+    int num_thread;  //!< how many threads to run
+    int cluster;     //!< cpu cluster
+    int precision;   //!< precision of calculation
+    size_t affinity; //!< affinity of cpu core, max 64 cpus
 } cpu_opt_t;
diff --git a/source/device/cpu/cpu_device.c b/source/device/cpu/cpu_device.c
index ad00395e1..1c8270fa6 100644
--- a/source/device/cpu/cpu_device.c
+++ b/source/device/cpu/cpu_device.c
@@ -47,21 +47,18 @@
 
 #include <string.h>
 
-
 int init_cpu(struct device* device)
 {
     (void)device;
     return register_all_cpu_ops();
 }
 
-
 int release_cpu(struct device* device)
 {
     (void)device;
     return unregister_all_cpu_ops();
 }
 
-
 static int prerun(struct device* dev, struct subgraph* subgraph, void* option)
 {
     struct exec_graph* exec_graph;
@@ -92,13 +89,11 @@ static int prerun(struct device* dev, struct subgraph* subgraph, void* option)
         exec_graph->timer = NULL;
     }
 
-
     subgraph->device_graph = exec_graph;
 
     return 0;
 }
 
-
 static int run(struct device* dev, struct subgraph* subgraph)
 {
     struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph;
@@ -113,7 +108,7 @@ static int run(struct device* dev, struct subgraph* subgraph)
 
     for (int i = 0; i < node_num; i++)
     {
-        struct exec_node* node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i);
+        struct exec_node* node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i);
         struct node_ops* node_ops = node->node_ops;
 
         /* TODO: handle the shape changed  and dynamic shape case */
@@ -167,7 +162,7 @@ static int run(struct device* dev, struct subgraph* subgraph)
             struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, node->ir_node->input_tensors[j]);
             if (input_tensor->dim_num <= 5)
             {
-                char dir_str[32] = { 0 };
+                char dir_str[32] = {0};
                 sprintf(dir_str, "in[%d]", j);
 
                 if (NULL != input_tensor->data)
@@ -183,7 +178,7 @@ static int run(struct device* dev, struct subgraph* subgraph)
             /* debug */
             if (output_tensor->dim_num <= 5)
             {
-                char dir_str[32] = { 0 };
+                char dir_str[32] = {0};
                 sprintf(dir_str, "out[%d]", j);
 
                 extract_feature_from_tensor(dir_str, name, output_tensor);
@@ -225,7 +220,6 @@ static int run(struct device* dev, struct subgraph* subgraph)
     return 0;
 }
 
-
 static int postrun(struct device* dev, struct subgraph* subgraph)
 {
     struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph;
@@ -234,7 +228,7 @@ static int postrun(struct device* dev, struct subgraph* subgraph)
 
     for (int i = 0; i < node_num; i++)
     {
-        struct exec_node* node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i);
+        struct exec_node* node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i);
         struct node_ops* node_ops = node->node_ops;
 
         if (exec_graph->timer)
@@ -255,7 +249,6 @@ static int postrun(struct device* dev, struct subgraph* subgraph)
     return 0;
 }
 
-
 static int cpu_dev_release_exec_graph(struct device* dev, void* exec_graph)
 {
     if (NULL != exec_graph)
@@ -266,7 +259,6 @@ static int cpu_dev_release_exec_graph(struct device* dev, void* exec_graph)
     return 0;
 }
 
-
 static int cpu_allocate(struct device* device, struct subgraph* sub_graph)
 {
     /* set the correct input wait count: INPUT tensor is always ready */
@@ -283,7 +275,6 @@ static int cpu_allocate(struct device* device, struct subgraph* sub_graph)
     return 0;
 }
 
-
 static int cpu_describe(struct device* device, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* precision)
 {
     if (NULL == device)
@@ -317,7 +308,6 @@ static int cpu_describe(struct device* device, struct vector* allowed_ops, struc
     return 0;
 }
 
-
 static int cpu_evaluation(struct device* device, struct subgraph* sub_graph, struct vector* tensor, struct vector* node)
 {
     if (NULL == device)
@@ -332,7 +322,6 @@ static int cpu_evaluation(struct device* device, struct subgraph* sub_graph, str
     return 0;
 }
 
-
 static int cpu_release(struct device* device, struct subgraph* sub_graph)
 {
     if (NULL == device)
@@ -345,7 +334,6 @@ static int cpu_release(struct device* device, struct subgraph* sub_graph)
     return 0;
 }
 
-
 int cpu_split_graph(struct graph* ir_graph)
 {
     struct device* default_device = find_default_device();
@@ -390,47 +378,42 @@ int cpu_split_graph(struct graph* ir_graph)
     return 0;
 }
 
-
 static struct interface cpu_interface = {
-        .init           = init_cpu,
-        .pre_run        = prerun,
-        .run            = run,
-        .post_run       = postrun,
-        .async_run      = NULL,
-        .async_wait     = NULL,
-        .release_graph  = cpu_dev_release_exec_graph,
-        .release_device = release_cpu,
+    .init = init_cpu,
+    .pre_run = prerun,
+    .run = run,
+    .post_run = postrun,
+    .async_run = NULL,
+    .async_wait = NULL,
+    .release_graph = cpu_dev_release_exec_graph,
+    .release_device = release_cpu,
 };
 
-
 static struct allocator cpu_allocator = {
-        .describe       = cpu_describe,
-        .evaluation     = cpu_evaluation,
-        .allocate       = cpu_allocate,
-        .release        = cpu_release,
+    .describe = cpu_describe,
+    .evaluation = cpu_evaluation,
+    .allocate = cpu_allocate,
+    .release = cpu_release,
 };
 
-
 static struct optimizer cpu_optimizer = {
-        .split_graph    = cpu_split_graph,
-        .optimize_graph = NULL,
+    .split_graph = cpu_split_graph,
+    .optimize_graph = NULL,
 };
 
-
 static struct cpu_device cpu_dev = {
-        .base = {
-                .name       = CPU_DEVICE_NAME,
-                .interface  = &cpu_interface,
-                .allocator  = &cpu_allocator,
-                .optimizer  = &cpu_optimizer,
-                .scheduler  = NULL,
-                .privacy    = NULL,
-        },
-        .master_cpu         = 0,
-        .cpu_model          = 0,
+    .base = {
+        .name = CPU_DEVICE_NAME,
+        .interface = &cpu_interface,
+        .allocator = &cpu_allocator,
+        .optimizer = &cpu_optimizer,
+        .scheduler = NULL,
+        .privacy = NULL,
+    },
+    .master_cpu = 0,
+    .cpu_model = 0,
 };
 
-
 int register_cpu_device(void)
 {
 #ifdef TENGINE_AUTO_LOAD_HCL
@@ -448,7 +431,6 @@ int register_cpu_device(void)
     return 0;
 }
 
-
 int unregister_cpu_device(void)
 {
     int ret = unregister_device(&cpu_dev.base);
diff --git a/source/device/cpu/cpu_device.h b/source/device/cpu/cpu_device.h
index 4f717e98e..d39a44dd9 100644
--- a/source/device/cpu/cpu_device.h
+++ b/source/device/cpu/cpu_device.h
@@ -28,11 +28,9 @@
 
 #include "device/device.h"
 
-
 struct node_ops;
 struct node;
 
-
 struct cpu_device
 {
     struct device base;
@@ -40,5 +38,4 @@ struct cpu_device
     uint8_t cpu_model;
 };
 
-
 int register_cpu_device(void);
diff --git a/source/device/cpu/cpu_dump.c b/source/device/cpu/cpu_dump.c
index 2cce834af..c29a7ca83 100644
--- a/source/device/cpu/cpu_dump.c
+++ b/source/device/cpu/cpu_dump.c
@@ -52,7 +52,6 @@
 #include <sys/time.h>
 #endif
 
-
 char* replace_string_character(const char* src_str, char* dst_str, const char* target_char, const char* replaced_char)
 {
     const char* p;
@@ -82,40 +81,41 @@ char* replace_string_character(const char* src_str, char* dst_str, const char* t
     return dst_str;
 }
 
-
 int get_tensor_cv_shape(const struct tensor* tensor, int* n, int* c, int* h, int* w)
 {
-    if (NULL == tensor || NULL == n || NULL == c || NULL == h || NULL ==w)
+    if (NULL == tensor || NULL == n || NULL == c || NULL == h || NULL == w)
     {
         return -1;
     }
 
-    *n = 0; *c = 0; *h = 0; *w = 0;
+    *n = 0;
+    *c = 0;
+    *h = 0;
+    *w = 0;
     const int* dims = tensor->dims;
 
     switch (tensor->dim_num)
     {
-        case 4:
-            *n = dims[0];
-            *c = dims[1];
-            *h = dims[2];
-            *w = dims[3];
-            break;
-        case 3:
-            *n = dims[0];
-            *h = dims[1];
-            *w = dims[2];
-        case 2:
-            *n = dims[0];
-            *w = dims[1];
-        default:
-            return -1;
+    case 4:
+        *n = dims[0];
+        *c = dims[1];
+        *h = dims[2];
+        *w = dims[3];
+        break;
+    case 3:
+        *n = dims[0];
+        *h = dims[1];
+        *w = dims[2];
+    case 2:
+        *n = dims[0];
+        *w = dims[1];
+    default:
+        return -1;
     }
 
     return 0;
 }
 
-
 float get_node_total_flops(struct node* node)
 {
     float flops = 0.f;
@@ -151,214 +151,132 @@ float get_node_total_flops(struct node* node)
     return flops;
 }
 
-
 int print_tensor_data_value(FILE* file, const struct tensor* tensor, int offset)
 {
     switch (tensor->data_type)
     {
-        case TENGINE_DT_FP32:
-        {
-            float* base_ptr = (float*)tensor->data;
-            float val = base_ptr[offset];
-            if (val < 0)
-                fprintf(file, "%.4f ", val);
-            else
-                fprintf(file, " %.4f ", val);
-            break;
-        }
-        case TENGINE_DT_FP16:
-        {
-            fp16_t* base_ptr = (fp16_t*)tensor->data;
-            fp16_t val = base_ptr[offset];
-
-            float val_fp32 = fp16_to_fp32(val);
-
-            if (val_fp32 < 0)
-                fprintf(file, "%.4f ", val_fp32);
-            else
-                fprintf(file, " %.4f ", val_fp32);
-            break;
-        }
-        case TENGINE_DT_UINT8:
-        {
-            uint8_t* base_ptr = (uint8_t*)tensor->data;
-            uint8_t val = base_ptr[offset];
+    case TENGINE_DT_FP32:
+    {
+        float* base_ptr = (float*)tensor->data;
+        float val = base_ptr[offset];
+        if (val < 0)
+            fprintf(file, "%.4f ", val);
+        else
+            fprintf(file, " %.4f ", val);
+        break;
+    }
+    case TENGINE_DT_FP16:
+    {
+        fp16_t* base_ptr = (fp16_t*)tensor->data;
+        fp16_t val = base_ptr[offset];
 
-            float scale = tensor->scale;
-            int32_t zero_point = tensor->zero_point;
+        float val_fp32 = fp16_to_fp32(val);
 
-            float val_fp32 = (float)((int)val - (int)zero_point) * scale;
-            if (val_fp32 < 0)
-                fprintf(file, "%.4f ", val_fp32);
-            else
-                fprintf(file, " %.4f ", val_fp32);
-            break;
-        }
-        case TENGINE_DT_INT8:
-        {
-            int8_t * base_ptr = (int8_t*)tensor->data;
-            int8_t val = base_ptr[offset];
+        if (val_fp32 < 0)
+            fprintf(file, "%.4f ", val_fp32);
+        else
+            fprintf(file, " %.4f ", val_fp32);
+        break;
+    }
+    case TENGINE_DT_UINT8:
+    {
+        uint8_t* base_ptr = (uint8_t*)tensor->data;
+        uint8_t val = base_ptr[offset];
+
+        float scale = tensor->scale;
+        int32_t zero_point = tensor->zero_point;
+
+        float val_fp32 = (float)((int)val - (int)zero_point) * scale;
+        if (val_fp32 < 0)
+            fprintf(file, "%.4f ", val_fp32);
+        else
+            fprintf(file, " %.4f ", val_fp32);
+        break;
+    }
+    case TENGINE_DT_INT8:
+    {
+        int8_t* base_ptr = (int8_t*)tensor->data;
+        int8_t val = base_ptr[offset];
 
-            float scale = tensor->scale;
+        float scale = tensor->scale;
 
-            float val_fp32 = (float)val * scale;
-            if (val_fp32 < 0)
-                fprintf(file, "%.4f ", val_fp32);
-            else
-                fprintf(file, " %.4f ", val_fp32);
-        }
-        case TENGINE_DT_INT32:
-        {
-            int32_t* base_ptr = (int32_t*)tensor->data;
-            int8_t val = base_ptr[offset];
+        float val_fp32 = (float)val * scale;
+        if (val_fp32 < 0)
+            fprintf(file, "%.4f ", val_fp32);
+        else
+            fprintf(file, " %.4f ", val_fp32);
+    }
+    case TENGINE_DT_INT32:
+    {
+        int32_t* base_ptr = (int32_t*)tensor->data;
+        int8_t val = base_ptr[offset];
 
-            float scale = tensor->scale;
-            float val_fp32 = (float)val * scale;
+        float scale = tensor->scale;
+        float val_fp32 = (float)val * scale;
 
-            if (val_fp32 < 0)
-                fprintf(file, "%.6f ", val_fp32);
-            else
-                fprintf(file, " %.6f ", val_fp32);
-        }
+        if (val_fp32 < 0)
+            fprintf(file, "%.6f ", val_fp32);
+        else
+            fprintf(file, " %.6f ", val_fp32);
+    }
     }
 
     return 0;
 }
 
-
 void print_tensor_data_to_file(FILE* file, const struct tensor* tensor)
 {
     switch (tensor->dim_num)
     {
-        case 5:
-        {
-            int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                channel = tensor->dims[2];
-                height = tensor->dims[3];
-                width = tensor->dims[4];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                height = tensor->dims[2];
-                width = tensor->dims[3];
-                channel = tensor->dims[4];
-            }
-
-            if (TENGINE_DT_FP32 == tensor->data_type)
-            {
-                fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width);
-            }
-            else
-            {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name);
-                }
-            }
-
-            for (int d5 = 0; d5 < dim5; d5++)
-            {
-                fprintf(file, "Dim5 %d:\n", d5);
-
-                for (int n = 0; n < batch; n++)
-                {
-                    fprintf(file, "\tBatch %d:\n", n);
-
-                    for (int ch = 0; ch < channel; ch++)
-                    {
-                        fprintf(file, "\t\tChannel %d:\n", ch);
-
-                        for (int h = 0; h < height; h++)
-                        {
-                            fprintf(file, "\t\t\t");
-
-                            for (int w = 0; w < width; w++)
-                            {
-                                int offset = 0;
-
-                                if (TENGINE_LAYOUT_NCHW == tensor->layout)
-                                {
-                                    offset += d5 * batch * channel * height * width;
-                                    offset += n * channel * height * width;
-                                    offset += ch * height * width;
-                                    offset += h * width;
-                                    offset += w;
-                                }
-                                if (TENGINE_LAYOUT_NHWC == tensor->layout)
-                                {
-                                    offset += d5 * batch * channel * height * width;
-                                    offset += n * channel * height * width;
-                                    offset += ch;
-                                    offset += h * width * channel;
-                                    offset += w * channel;
-                                }
-
-                                print_tensor_data_value(file, tensor, offset);
-                            }
-                            fprintf(file, "\n");
-                        }
-                        fprintf(file, "\n");
-                    }
-                    fprintf(file, "\n");
-                }
-                fprintf(file, "\n");
-            }
+    case 5:
+    {
+        int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0;
 
-            break;
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            channel = tensor->dims[2];
+            height = tensor->dims[3];
+            width = tensor->dims[4];
         }
-        case 4:
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
         {
-            int batch = tensor->dims[0], channel = 0, height = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                channel = tensor->dims[1];
-                height = tensor->dims[2];
-                width = tensor->dims[3];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                height = tensor->dims[1];
-                width = tensor->dims[2];
-                channel = tensor->dims[3];
-            }
+            height = tensor->dims[2];
+            width = tensor->dims[3];
+            channel = tensor->dims[4];
+        }
 
-            if (TENGINE_DT_FP32 == tensor->data_type)
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
             {
-                fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width);
+                fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width);
             }
             else
             {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name);
-                }
+                const char* type_name = get_tensor_data_type_string(tensor->data_type);
+                fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name);
             }
+        }
+
+        for (int d5 = 0; d5 < dim5; d5++)
+        {
+            fprintf(file, "Dim5 %d:\n", d5);
 
             for (int n = 0; n < batch; n++)
             {
-                fprintf(file, "Batch %d:\n", n);
+                fprintf(file, "\tBatch %d:\n", n);
 
                 for (int ch = 0; ch < channel; ch++)
                 {
-                    fprintf(file, "\tChannel %d:\n", ch);
+                    fprintf(file, "\t\tChannel %d:\n", ch);
 
                     for (int h = 0; h < height; h++)
                     {
-                        fprintf(file, "\t\t");
+                        fprintf(file, "\t\t\t");
 
                         for (int w = 0; w < width; w++)
                         {
@@ -366,6 +284,7 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor)
 
                             if (TENGINE_LAYOUT_NCHW == tensor->layout)
                             {
+                                offset += d5 * batch * channel * height * width;
                                 offset += n * channel * height * width;
                                 offset += ch * height * width;
                                 offset += h * width;
@@ -373,6 +292,7 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor)
                             }
                             if (TENGINE_LAYOUT_NHWC == tensor->layout)
                             {
+                                offset += d5 * batch * channel * height * width;
                                 offset += n * channel * height * width;
                                 offset += ch;
                                 offset += h * width * channel;
@@ -387,49 +307,56 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor)
                 }
                 fprintf(file, "\n");
             }
+            fprintf(file, "\n");
+        }
+
+        break;
+    }
+    case 4:
+    {
+        int batch = tensor->dims[0], channel = 0, height = 0, width = 0;
 
-            break;
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            channel = tensor->dims[1];
+            height = tensor->dims[2];
+            width = tensor->dims[3];
         }
-        case 3:
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
         {
-            int batch = 0, height = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                batch = tensor->dims[0];
-                height = tensor->dims[1];
-                width = tensor->dims[2];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                height = tensor->dims[0];
-                width = tensor->dims[1];
-                batch = tensor->dims[2];
-            }
+            height = tensor->dims[1];
+            width = tensor->dims[2];
+            channel = tensor->dims[3];
+        }
 
-            if (TENGINE_DT_FP32 == tensor->data_type)
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
             {
-                fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width);
+                fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width);
             }
             else
             {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name);
-                }
+                const char* type_name = get_tensor_data_type_string(tensor->data_type);
+                fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name);
             }
+        }
 
-            for (int n = 0; n < batch; n++)
+        for (int n = 0; n < batch; n++)
+        {
+            fprintf(file, "Batch %d:\n", n);
+
+            for (int ch = 0; ch < channel; ch++)
             {
+                fprintf(file, "\tChannel %d:\n", ch);
+
                 for (int h = 0; h < height; h++)
                 {
-                    fprintf(file, "Channel %d:\n", h);
-                    fprintf(file, "\t");
+                    fprintf(file, "\t\t");
 
                     for (int w = 0; w < width; w++)
                     {
@@ -437,15 +364,17 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor)
 
                         if (TENGINE_LAYOUT_NCHW == tensor->layout)
                         {
-                            offset += n * height * width;
+                            offset += n * channel * height * width;
+                            offset += ch * height * width;
                             offset += h * width;
                             offset += w;
                         }
                         if (TENGINE_LAYOUT_NHWC == tensor->layout)
                         {
-                            offset += h;
-                            offset += n * width * height;
-                            offset += w * height;
+                            offset += n * channel * height * width;
+                            offset += ch;
+                            offset += h * width * channel;
+                            offset += w * channel;
                         }
 
                         print_tensor_data_value(file, tensor, offset);
@@ -454,86 +383,153 @@ void print_tensor_data_to_file(FILE* file, const struct tensor* tensor)
                 }
                 fprintf(file, "\n");
             }
+            fprintf(file, "\n");
+        }
 
-            break;
+        break;
+    }
+    case 3:
+    {
+        int batch = 0, height = 0, width = 0;
+
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            batch = tensor->dims[0];
+            height = tensor->dims[1];
+            width = tensor->dims[2];
         }
-        case 2:
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
         {
-            int batch = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                batch = tensor->dims[0];
-                width = tensor->dims[1];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                batch = tensor->dims[0];
-                width = tensor->dims[1];
-            }
+            height = tensor->dims[0];
+            width = tensor->dims[1];
+            batch = tensor->dims[2];
+        }
 
-            if (TENGINE_DT_FP32 == tensor->data_type)
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
             {
-                fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width);
+                fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width);
             }
             else
             {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name);
-                }
+                const char* type_name = get_tensor_data_type_string(tensor->data_type);
+                fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name);
             }
+        }
 
-            for (int n = 0; n < batch; n++)
+        for (int n = 0; n < batch; n++)
+        {
+            for (int h = 0; h < height; h++)
             {
+                fprintf(file, "Channel %d:\n", h);
+                fprintf(file, "\t");
+
                 for (int w = 0; w < width; w++)
                 {
                     int offset = 0;
 
-                    offset += n * width;
-                    offset += w;
+                    if (TENGINE_LAYOUT_NCHW == tensor->layout)
+                    {
+                        offset += n * height * width;
+                        offset += h * width;
+                        offset += w;
+                    }
+                    if (TENGINE_LAYOUT_NHWC == tensor->layout)
+                    {
+                        offset += h;
+                        offset += n * width * height;
+                        offset += w * height;
+                    }
 
                     print_tensor_data_value(file, tensor, offset);
                 }
                 fprintf(file, "\n");
             }
+            fprintf(file, "\n");
+        }
+
+        break;
+    }
+    case 2:
+    {
+        int batch = 0, width = 0;
 
-            break;
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            batch = tensor->dims[0];
+            width = tensor->dims[1];
         }
-        case 1:
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
         {
-            int width = tensor->dims[0];
-
-            fprintf(file, "Shape is {%d}, data type is fp32\n", width);
+            batch = tensor->dims[0];
+            width = tensor->dims[1];
+        }
 
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
+            {
+                fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width);
+            }
+            else
+            {
+                const char* type_name = get_tensor_data_type_string(tensor->data_type);
+                fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name);
+            }
+        }
 
+        for (int n = 0; n < batch; n++)
+        {
             for (int w = 0; w < width; w++)
             {
-                print_tensor_data_value(file, tensor, w);
+                int offset = 0;
+
+                offset += n * width;
+                offset += w;
+
+                print_tensor_data_value(file, tensor, offset);
             }
+            fprintf(file, "\n");
+        }
+
+        break;
+    }
+    case 1:
+    {
+        int width = tensor->dims[0];
 
-            break;
+        fprintf(file, "Shape is {%d}, data type is fp32\n", width);
+
+        for (int w = 0; w < width; w++)
+        {
+            print_tensor_data_value(file, tensor, w);
         }
-        default:
-            printf("Input dimension %d not to be supported.\n", tensor->dim_num);
+
+        break;
+    }
+    default:
+        printf("Input dimension %d not to be supported.\n", tensor->dim_num);
     }
 }
 
-
 /*
  * Extract the blob feature map
  */
 void extract_feature_from_tensor(const char* comment, const char* layer_name, const struct tensor* tensor)
 {
     // 1. deal with saving path
-    char save_dir[256] = { '0' };
+    char save_dir[256] = {'0'};
 
-    const char *env_path = getenv(TENGINE_DUMP_DIR);
+    const char* env_path = getenv(TENGINE_DUMP_DIR);
 
     if (NULL != env_path && (256 - 2) > strlen(env_path))
     {
@@ -552,7 +548,7 @@ void extract_feature_from_tensor(const char* comment, const char* layer_name, co
     }
     else
     {
-//        TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path));
+        //        TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path));
         sprintf(save_dir, "./output/");
 #ifdef _MSC_VER
         CreateDirectoryA(save_dir, NULL);
@@ -582,7 +578,7 @@ void extract_feature_from_tensor(const char* comment, const char* layer_name, co
     replace_string_character(layer_short_name, layer_legal_name, "/", "-");
 
     // 3. join path
-    char output_file_path[512] = { '0' };
+    char output_file_path[512] = {'0'};
 
     if (strlen(layer_legal_name) + strlen(save_dir) + strlen(comment) > 256 - 16)
     {
@@ -606,13 +602,12 @@ void extract_feature_from_tensor(const char* comment, const char* layer_name, co
     file = NULL;
 }
 
-
 void extract_node_executed_time(struct subgraph* subgraph, int node_id)
 {
     struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph;
     int node_num = get_vector_num(exec_graph->exec_node_list);
     int i = node_id;
-    struct exec_node* node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i);
+    struct exec_node* node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i);
 
     double* timer = (double*)exec_graph->timer;
 
@@ -638,51 +633,51 @@ void extract_node_executed_time(struct subgraph* subgraph, int node_id)
 
     switch (node->ir_node->op.type)
     {
-        case OP_CONV:
+    case OP_CONV:
+    {
+        struct conv_param* param = (struct conv_param*)node->ir_node->op.param_mem;
+        fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w,
+                param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1);
+        if (param->group != 1)
         {
-            struct conv_param* param = (struct conv_param*)node->ir_node->op.param_mem;
-            fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w,
-                    param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1);
-            if(param->group != 1)
-            {
-                fprintf(stdout, " DW(%3d) ", param->group);
-            }
-            else
-            {
-                fprintf(stdout, "         ");
-            }
-            break;
+            fprintf(stdout, " DW(%3d) ", param->group);
         }
-        case OP_DECONV:
+        else
         {
-            struct deconv_param* param = (struct deconv_param*)node->ir_node->op.param_mem;
-            fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w,
-                    param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1);
-            if(param->group != 1)
-            {
-                fprintf(stdout, " DW(%3d) ", param->group);
-            }
-            else
-            {
-                fprintf(stdout, "         ");
-            }
-            break;
+            fprintf(stdout, "         ");
         }
-        case OP_POOL:
+        break;
+    }
+    case OP_DECONV:
+    {
+        struct deconv_param* param = (struct deconv_param*)node->ir_node->op.param_mem;
+        fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w,
+                param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1);
+        if (param->group != 1)
         {
-            struct pool_param* param = (struct pool_param*)node->ir_node->op.param_mem;
-            fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w,
-                    param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1);
-            if(param->pool_method == 0)
-            {
-                fprintf(stdout, "         Max");
-            }
-            else
-            {
-                fprintf(stdout, "         Avg");
-            }
-            break;
+            fprintf(stdout, " DW(%3d) ", param->group);
         }
+        else
+        {
+            fprintf(stdout, "         ");
+        }
+        break;
+    }
+    case OP_POOL:
+    {
+        struct pool_param* param = (struct pool_param*)node->ir_node->op.param_mem;
+        fprintf(stdout, "K: %dx%d | S: %dx%d | P: %d %d %d %d", param->kernel_h, param->kernel_w, param->stride_h, param->stride_w,
+                param->pad_h0, param->pad_h1, param->pad_w0, param->pad_w1);
+        if (param->pool_method == 0)
+        {
+            fprintf(stdout, "         Max");
+        }
+        else
+        {
+            fprintf(stdout, "         Avg");
+        }
+        break;
+    }
     }
 
     if (OP_CONV == node->ir_node->op.type || OP_DECONV == node->ir_node->op.type)
@@ -699,9 +694,6 @@ void extract_node_executed_time(struct subgraph* subgraph, int node_id)
     }
 }
 
-
-
-
 double get_current_time(void)
 {
 #ifdef _MSC_VER
diff --git a/source/device/cpu/cpu_dump.h b/source/device/cpu/cpu_dump.h
index 23e9471a3..e916a2078 100644
--- a/source/device/cpu/cpu_dump.h
+++ b/source/device/cpu/cpu_dump.h
@@ -27,7 +27,6 @@
 struct tensor;
 struct subgraph;
 
-
 void extract_feature_from_tensor(const char* comment, const char* layer_name, const struct tensor* tensor);
 
 void extract_node_executed_time(struct subgraph* subgraph, int node_id);
diff --git a/source/device/cpu/cpu_graph.c b/source/device/cpu/cpu_graph.c
index 5136a2178..7032ed50c 100644
--- a/source/device/cpu/cpu_graph.c
+++ b/source/device/cpu/cpu_graph.c
@@ -39,10 +39,9 @@
 #include "utility/log.h"
 #include "serializer/serializer.h"
 
-
 static struct exec_graph* new_exec_graph(void)
 {
-    struct exec_graph* exec_graph = ( struct exec_graph* )sys_malloc(sizeof(struct exec_graph));
+    struct exec_graph* exec_graph = (struct exec_graph*)sys_malloc(sizeof(struct exec_graph));
 
     if (exec_graph == NULL)
         return NULL;
@@ -65,16 +64,15 @@ static struct exec_graph* new_exec_graph(void)
     return exec_graph;
 }
 
-
 void release_exec_graph(void* exec_graph)
 {
-    struct exec_graph* graph = ( struct exec_graph* )exec_graph;
+    struct exec_graph* graph = (struct exec_graph*)exec_graph;
 
     int node_num = get_vector_num(graph->exec_node_list);
 
     for (int i = 0; i < node_num; i++)
     {
-        struct exec_node* exec_node = ( struct exec_node* )get_vector_data(graph->exec_node_list, i);
+        struct exec_node* exec_node = (struct exec_node*)get_vector_data(graph->exec_node_list, i);
         struct node_ops* node_ops = exec_node->node_ops;
 
         release_exec_node(graph, exec_node, node_ops);
@@ -87,7 +85,6 @@ void release_exec_graph(void* exec_graph)
     sys_free(graph);
 }
 
-
 struct exec_graph* create_exec_graph(struct subgraph* subgraph, int num_thread, int mode, size_t cpu_affinity)
 {
     /* generate exec_graph */
@@ -138,19 +135,18 @@ struct exec_graph* create_exec_graph(struct subgraph* subgraph, int num_thread,
 
     return exec_graph;
 
-    error:
+error:
     release_exec_graph(exec_graph);
     return NULL;
 }
 
-
 int prerun_exec_graph(struct exec_graph* exec_graph)
 {
     int node_num = get_vector_num(exec_graph->exec_node_list);
 
     for (int i = 0; i < node_num; i++)
     {
-        struct exec_node* exec_node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i);
+        struct exec_node* exec_node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i);
         struct node_ops* node_ops = exec_node->node_ops;
 
         if (node_ops->prerun && node_ops->prerun(node_ops, exec_node, exec_graph) < 0)
diff --git a/source/device/cpu/cpu_graph.h b/source/device/cpu/cpu_graph.h
index 0e3fc61bd..e40497843 100644
--- a/source/device/cpu/cpu_graph.h
+++ b/source/device/cpu/cpu_graph.h
@@ -29,24 +29,22 @@
 
 #include <stddef.h>
 
-
 struct exec_graph
 {
-    struct vector*      exec_node_list;
-    struct mem_pool*    mem_pool;
-    struct cpu_device*  dev;
-
-    void*   shared_mem;
-    int     shared_mem_size;
-    void*   shared_pack4_mem;
-    int     shared_pack4_mem_size;
-    int     num_thread;
-    int     mode;
-    size_t  cpu_affinity;
-    void*   timer;
+    struct vector* exec_node_list;
+    struct mem_pool* mem_pool;
+    struct cpu_device* dev;
+
+    void* shared_mem;
+    int shared_mem_size;
+    void* shared_pack4_mem;
+    int shared_pack4_mem_size;
+    int num_thread;
+    int mode;
+    size_t cpu_affinity;
+    void* timer;
 };
 
-
 struct exec_graph* create_exec_graph(struct subgraph* subgraph, int num_thread, int mode, size_t cpu_affinity);
 
 int prerun_exec_graph(struct exec_graph* exec_graph);
diff --git a/source/device/cpu/cpu_module.c b/source/device/cpu/cpu_module.c
index eda4b21c5..7f024cb09 100644
--- a/source/device/cpu/cpu_module.c
+++ b/source/device/cpu/cpu_module.c
@@ -45,27 +45,24 @@
 #include "utility/log.h"
 #include "serializer/serializer.h"
 
-
 static struct vector** cpu_builtin_ops_registry;
-static struct vector*  cpu_custom_ops_registry;
+static struct vector* cpu_custom_ops_registry;
 
 #ifdef TENGINE_AUTO_LOAD_HCL
 void* hcl_handler = NULL;
 #endif
 
-
 struct custom_reg_entry
 {
     int op_type;
     struct node_ops* node_ops;
 };
 
-
 static int init_builtin_ops_registry(void)
 {
     int alloc_num = 0;
 
-    cpu_builtin_ops_registry = ( struct vector** )sys_malloc(sizeof(void*) * OP_BUILTIN_LAST);
+    cpu_builtin_ops_registry = (struct vector**)sys_malloc(sizeof(void*) * OP_BUILTIN_LAST);
 
     if (cpu_builtin_ops_registry == NULL)
         return -1;
@@ -83,7 +80,7 @@ static int init_builtin_ops_registry(void)
 
     return 0;
 
-    error:
+error:
     for (int i = 0; i < alloc_num; i++)
     {
         release_vector(cpu_builtin_ops_registry[i]);
@@ -148,7 +145,7 @@ static inline struct node_ops* find_builtin_node_ops(struct exec_graph* exec_gra
 
     for (int i = 0; i < num; i++)
     {
-        struct node_ops* node_ops = *( struct node_ops** )get_vector_data(ops_vector, i);
+        struct node_ops* node_ops = *(struct node_ops**)get_vector_data(ops_vector, i);
 
         int score = node_ops->score(node_ops, exec_graph, ir_node);
 
@@ -199,7 +196,7 @@ int register_custom_node_ops(int op_type, struct node_ops* node_ops)
 
     for (int i = 0; i < n; i++)
     {
-        struct custom_reg_entry* entry = ( struct custom_reg_entry* )get_vector_data(cpu_custom_ops_registry, i);
+        struct custom_reg_entry* entry = (struct custom_reg_entry*)get_vector_data(cpu_custom_ops_registry, i);
 
         if (entry->op_type == op_type)
         {
@@ -228,7 +225,7 @@ int unregister_custom_node_ops(int op_type, struct node_ops* node_ops)
 
     for (int i = 0; i < n; i++)
     {
-        struct custom_reg_entry* entry = ( struct custom_reg_entry* )get_vector_data(cpu_custom_ops_registry, i);
+        struct custom_reg_entry* entry = (struct custom_reg_entry*)get_vector_data(cpu_custom_ops_registry, i);
 
         if (entry->op_type == op_type && entry->node_ops == node_ops)
         {
@@ -247,7 +244,7 @@ static inline struct node_ops* find_custom_node_ops(struct exec_graph* exec_grap
 
     for (int i = 0; i < n; i++)
     {
-        struct custom_reg_entry* entry = ( struct custom_reg_entry* )get_vector_data(cpu_custom_ops_registry, i);
+        struct custom_reg_entry* entry = (struct custom_reg_entry*)get_vector_data(cpu_custom_ops_registry, i);
 
         if (entry->op_type == op_type)
             return entry->node_ops;
diff --git a/source/device/cpu/cpu_module.h b/source/device/cpu/cpu_module.h
index 347c93dcb..50edd5f8e 100644
--- a/source/device/cpu/cpu_module.h
+++ b/source/device/cpu/cpu_module.h
@@ -24,11 +24,9 @@
 
 #pragma once
 
-
 struct node_ops;
 struct exec_graph;
 
-
 int init_cpu_node_ops_registry(void);
 void release_cpu_node_ops_registry(void);
 
diff --git a/source/device/cpu/cpu_node.c b/source/device/cpu/cpu_node.c
index 7a14b8eac..9ea1aa72e 100644
--- a/source/device/cpu/cpu_node.c
+++ b/source/device/cpu/cpu_node.c
@@ -28,7 +28,6 @@
 #include "graph/node.h"
 #include "utility/sys_port.h"
 
-
 int init_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, struct node* ir_node, struct node_ops* node_ops)
 {
     exec_node->ir_node = ir_node;
@@ -44,7 +43,7 @@ int init_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, s
 
     if (exec_node->output_num > 4)
     {
-        exec_node->block_id_ptr = ( int8_t* )sys_malloc(sizeof(int8_t) * exec_node->output_num);
+        exec_node->block_id_ptr = (int8_t*)sys_malloc(sizeof(int8_t) * exec_node->output_num);
         block_id = exec_node->block_id_ptr;
     }
 
@@ -57,7 +56,6 @@ int init_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, s
     return 0;
 }
 
-
 void release_exec_node(struct exec_graph* exec_graph, struct exec_node* exec_node, struct node_ops* node_ops)
 {
     if (node_ops->release_node)
diff --git a/source/device/cpu/cpu_node.h b/source/device/cpu/cpu_node.h
index 8787a2929..b0c2fa575 100644
--- a/source/device/cpu/cpu_node.h
+++ b/source/device/cpu/cpu_node.h
@@ -29,31 +29,29 @@
 
 #include <stdint.h>
 
-
 struct node;
 struct node_ops;
 struct exec_node;
 struct exec_graph;
 
-
 struct exec_node
 {
-    struct node*        ir_node;
-    struct node_ops*    node_ops;
-    void*               ops_priv; /* priv data for ops */
+    struct node* ir_node;
+    struct node_ops* node_ops;
+    void* ops_priv; /* priv data for ops */
 
-    int8_t              inplace_map_num;
-    int8_t              output_num;
+    int8_t inplace_map_num;
+    int8_t output_num;
 
     union
     {
         uint8_t* inplace_map_ptr;
-        uint8_t  inplace_map[4]; /* opt for single inplace map, such as relu */
+        uint8_t inplace_map[4]; /* opt for single inplace map, such as relu */
     };
 
     union
     {
-        int8_t  block_id[4];
+        int8_t block_id[4];
         int8_t* block_id_ptr;
     };
 
@@ -61,7 +59,6 @@ struct exec_node
     int shared_pack4_mem_size;
 };
 
-
 struct node_ops
 {
     int (*prerun)(struct node_ops*, struct exec_node*, struct exec_graph*);
diff --git a/source/device/cpu/cpu_pool.c b/source/device/cpu/cpu_pool.c
index 9a848bd53..21a4917e4 100644
--- a/source/device/cpu/cpu_pool.c
+++ b/source/device/cpu/cpu_pool.c
@@ -36,8 +36,6 @@
 #include "utility/vector.h"
 #include "utility/log.h"
 
-
-
 struct mem_record
 {
     struct tensor* ir_tensor;
@@ -45,7 +43,6 @@ struct mem_record
     int block_id;
 };
 
-
 static int find_inplace_input(struct exec_node* exec_node, int output_slot, struct node* ir_node, struct graph* ir_graph)
 {
     if (exec_node->inplace_map_num == 0)
@@ -79,14 +76,13 @@ static int find_inplace_input(struct exec_node* exec_node, int output_slot, stru
     return input_slot;
 }
 
-
 static int find_tensor_mem_list(struct vector* tensor_mem_list, const struct tensor* ir_tensor)
 {
     int rec_number = get_vector_num(tensor_mem_list);
 
     for (int i = 0; i < rec_number; i++)
     {
-        struct mem_record* rec = ( struct mem_record* )get_vector_data(tensor_mem_list, i);
+        struct mem_record* rec = (struct mem_record*)get_vector_data(tensor_mem_list, i);
 
         if (rec->ir_tensor == ir_tensor)
             return i;
@@ -95,7 +91,6 @@ static int find_tensor_mem_list(struct vector* tensor_mem_list, const struct ten
     return -1;
 }
 
-
 void free_exec_graph_mem(struct exec_graph* graph)
 {
     /* free the shared memory */
@@ -121,7 +116,6 @@ void free_exec_graph_mem(struct exec_graph* graph)
     }
 }
 
-
 static void mem_pool_dump(struct mem_pool* mem_pool)
 {
     int block_number = get_vector_num(mem_pool->block_list);
@@ -130,7 +124,7 @@ static void mem_pool_dump(struct mem_pool* mem_pool)
 
     for (int i = 0; i < block_number; i++)
     {
-        struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i);
+        struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i);
 
         TLOG_INFO("Tengine: %d: %p (%d) used: %d free: %d\n", i, entry->addr, entry->block_size, entry->alloc_count,
                   entry->free_count);
@@ -139,12 +133,12 @@ static void mem_pool_dump(struct mem_pool* mem_pool)
 
 static void* mem_pool_get_mem_block(struct mem_pool* mem_pool, int block_id)
 {
-    struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, block_id);
+    struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, block_id);
 
     size_t addr = (size_t)(entry->addr);
     size_t aligned_addr = (addr + 4 + mem_pool->align_size) & (~(mem_pool->align_size - 1));
 
-    return ( void* )aligned_addr;
+    return (void*)aligned_addr;
 }
 
 static int mem_pool_get_backend_mem(struct mem_pool* mem_pool)
@@ -153,7 +147,7 @@ static int mem_pool_get_backend_mem(struct mem_pool* mem_pool)
 
     for (int i = 0; i < block_num; i++)
     {
-        struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i);
+        struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i);
 
         entry->block_size = entry->max_req_size + mem_pool->align_size + 128;
 
@@ -173,7 +167,7 @@ static int mem_pool_allocate(struct mem_pool* mem_pool, int size)
 
     for (int i = 0; i < block_num; i++)
     {
-        struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i);
+        struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i);
 
         if (entry->free_count != entry->alloc_count)
             continue;
@@ -202,15 +196,13 @@ static int mem_pool_allocate(struct mem_pool* mem_pool, int size)
     return block_num;
 }
 
-
 static void mem_pool_free(struct mem_pool* mem_pool, int block_id)
 {
-    struct mem_block_entry* block = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, block_id);
+    struct mem_block_entry* block = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, block_id);
 
     block->free_count++;
 }
 
-
 void release_mem_pool(struct mem_pool* mem_pool)
 {
     if (mem_pool->block_list != NULL)
@@ -219,7 +211,7 @@ void release_mem_pool(struct mem_pool* mem_pool)
 
         for (int i = 0; i < block_num; i++)
         {
-            struct mem_block_entry* entry = ( struct mem_block_entry* )get_vector_data(mem_pool->block_list, i);
+            struct mem_block_entry* entry = (struct mem_block_entry*)get_vector_data(mem_pool->block_list, i);
 
             sys_free(entry->addr);
         }
@@ -230,10 +222,9 @@ void release_mem_pool(struct mem_pool* mem_pool)
     sys_free(mem_pool);
 }
 
-
 static struct mem_pool* create_mem_pool(void)
 {
-    struct mem_pool* mem_pool = ( struct mem_pool* )sys_malloc(sizeof(struct mem_pool));
+    struct mem_pool* mem_pool = (struct mem_pool*)sys_malloc(sizeof(struct mem_pool));
 
     if (mem_pool == NULL)
         return NULL;
@@ -252,7 +243,7 @@ static struct mem_pool* create_mem_pool(void)
 
     return mem_pool;
 
-    error:
+error:
 
     release_mem_pool(mem_pool);
 
@@ -281,7 +272,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph)
 
     for (int i = 0; i < node_num; i++)
     {
-        struct exec_node* exec_node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i);
+        struct exec_node* exec_node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i);
         struct node* ir_node = exec_node->ir_node;
         struct graph* ir_graph = ir_node->graph;
 
@@ -311,7 +302,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph)
                 if (idx < 0)
                     continue;
 
-                struct mem_record* input_r = ( struct mem_record* )get_vector_data(tensor_mem_list, idx);
+                struct mem_record* input_r = (struct mem_record*)get_vector_data(tensor_mem_list, idx);
 
                 input_r->ir_tensor = ir_tensor;
                 input_r->used = ir_tensor->consumer_num;
@@ -346,7 +337,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph)
             if (idx < 0)
                 continue;
 
-            struct mem_record* input_r = ( struct mem_record* )get_vector_data(tensor_mem_list, idx);
+            struct mem_record* input_r = (struct mem_record*)get_vector_data(tensor_mem_list, idx);
 
             input_r->used--;
 
@@ -406,7 +397,7 @@ int alloc_exec_graph_mem(struct exec_graph* exec_graph)
     /* now, the real allocate */
     for (int i = 0; i < node_num; i++)
     {
-        struct exec_node* exec_node = ( struct exec_node* )get_vector_data(exec_graph->exec_node_list, i);
+        struct exec_node* exec_node = (struct exec_node*)get_vector_data(exec_graph->exec_node_list, i);
         struct node* ir_node = exec_node->ir_node;
         struct graph* ir_graph = ir_node->graph;
         struct mem_pool* local_mem_pool = exec_graph->mem_pool;
diff --git a/source/device/cpu/cpu_pool.h b/source/device/cpu/cpu_pool.h
index 0037fbfed..126a09633 100644
--- a/source/device/cpu/cpu_pool.h
+++ b/source/device/cpu/cpu_pool.h
@@ -31,7 +31,6 @@
 
 struct exec_graph;
 
-
 struct mem_block_entry
 {
     void* addr;
@@ -46,14 +45,13 @@ struct mem_pool
     uint8_t align_size; /* must be 2^n */
     struct vector* block_list;
 
-    int   (*get_backend_mem)(struct mem_pool*);
+    int (*get_backend_mem)(struct mem_pool*);
     void* (*get_mem_block)(struct mem_pool*, int block_id);
-    int   (*allocate)(struct mem_pool*, int size);
-    void  (*free)(struct mem_pool*, int block_id);
-    void  (*dump)(struct mem_pool*);
+    int (*allocate)(struct mem_pool*, int size);
+    void (*free)(struct mem_pool*, int block_id);
+    void (*dump)(struct mem_pool*);
 };
 
-
 void release_mem_pool(struct mem_pool* mem_pool);
 int alloc_exec_graph_mem(struct exec_graph* exec_graph);
 void free_exec_graph_mem(struct exec_graph* graph);
diff --git a/source/device/cpu/op/absval/absval_ref.c b/source/device/cpu/op/absval/absval_ref.c
index 925b0fd82..973bbae6d 100644
--- a/source/device/cpu/op/absval/absval_ref.c
+++ b/source/device/cpu/op/absval/absval_ref.c
@@ -33,25 +33,21 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -62,8 +58,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    float* idata = ( float* )input_tensor->data;
-    float* odata = ( float* )output_tensor->data;
+    float* idata = (float*)input_tensor->data;
+    float* odata = (float*)output_tensor->data;
 
     for (uint32_t i = 0; i < output_tensor->elem_num; i++)
     {
@@ -75,7 +71,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     return 0;
 }
 
-
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
     struct node* ir_node = exec_node;
@@ -90,7 +85,6 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     return OPS_SCORE_CANDO;
 }
 
-
 static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .run = run,
                                        .reshape = NULL,
@@ -104,7 +98,6 @@ int register_absval_ref_op()
     return register_builtin_node_ops(OP_ABSVAL, &hcl_node_ops);
 }
 
-
 int unregister_absval_ref_op()
 {
     return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops);
diff --git a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c
index 960f1f198..c01c37a0c 100644
--- a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c
+++ b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c
@@ -35,25 +35,21 @@
 
 #include <arm_neon.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -64,8 +60,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    float* idata = ( float* )input_tensor->data;
-    float* odata = ( float* )output_tensor->data;
+    float* idata = (float*)input_tensor->data;
+    float* odata = (float*)output_tensor->data;
 
     int channel_num = input_tensor->dims[1];
     int batch_number = input_tensor->dims[0];
@@ -99,7 +95,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     return 0;
 }
 
-
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
     struct node* ir_node = exec_node;
@@ -114,7 +109,6 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     return OPS_SCORE_BEST;
 }
 
-
 static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .run = run,
                                        .reshape = NULL,
@@ -123,13 +117,11 @@ static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .release_node = release_node,
                                        .score = score};
 
-
 int register_absval_hcl_arm_op()
 {
     return register_builtin_node_ops(OP_ABSVAL, &hcl_node_ops);
 }
 
-
 int unregister_absval_hcl_arm_op()
 {
     return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops);
diff --git a/source/device/cpu/op/add_n/add_n_ref.c b/source/device/cpu/op/add_n/add_n_ref.c
index 69b9d54dc..559b6cc44 100644
--- a/source/device/cpu/op/add_n/add_n_ref.c
+++ b/source/device/cpu/op/add_n/add_n_ref.c
@@ -33,7 +33,6 @@
 
 #include <math.h>
 
-
 struct add_n_op_param
 {
     int in_num;
@@ -56,7 +55,7 @@ static int ref_add_n_fp32(const float** input, float* output, int size, const st
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )sys_malloc(sizeof(struct add_n_op_param));
+    struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)sys_malloc(sizeof(struct add_n_op_param));
     exec_node->ops_priv = add_n_op_param;
 
     return 0;
@@ -72,12 +71,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )exec_node->ops_priv;
+    struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
 
     int in_num = ir_node->input_num;
     add_n_op_param->in_num = in_num;
-    add_n_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num);
+    add_n_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num);
 
     return 0;
 }
@@ -90,27 +89,27 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
     uint32_t elem_num = input_tensor_a->elem_num;
-    struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )exec_node->ops_priv;
+    struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv;
     for (int i = 0; i < add_n_op_param->in_num; i++)
     {
         struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]);
         void* data = input_tensor->data;
         add_n_op_param->input_data[i] = data;
     }
-    const void** input = ( const void** )add_n_op_param->input_data;
+    const void** input = (const void**)add_n_op_param->input_data;
 
     float* output = (float*)output_tensor->data;
     for (uint32_t i = 0; i < elem_num; i++)
     {
         output[i] = 0;
     }
-    ref_add_n_fp32(( const float** )input, output, elem_num, add_n_op_param);
+    ref_add_n_fp32((const float**)input, output, elem_num, add_n_op_param);
     return 0;
 }
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct add_n_op_param* add_n_op_param = ( struct add_n_op_param* )exec_node->ops_priv;
+    struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv;
     sys_free(add_n_op_param->input_data);
 
     return 0;
diff --git a/source/device/cpu/op/argmax/argmax_ref.c b/source/device/cpu/op/argmax/argmax_ref.c
index 413f982a5..ba8898a38 100644
--- a/source/device/cpu/op/argmax/argmax_ref.c
+++ b/source/device/cpu/op/argmax/argmax_ref.c
@@ -36,7 +36,6 @@
 
 #include <stdio.h>
 
-
 struct argmax_op_param
 {
     int axis;
@@ -112,7 +111,7 @@ static int ref_argmax_uint8(uint8_t* input, int* output, const struct argmax_op_
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct argmax_op_param* argmax_op_param = ( struct argmax_op_param* )sys_malloc(sizeof(struct argmax_op_param));
+    struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)sys_malloc(sizeof(struct argmax_op_param));
     argmax_op_param->axis = 0;
     argmax_op_param->axis_size = 1;
     argmax_op_param->inner_size = 1;
@@ -137,8 +136,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct argmax_op_param* argmax_op_param = ( struct argmax_op_param* )exec_node->ops_priv;
-    struct argmax_param* argmax_param = ( struct argmax_param* )ir_node->op.param_mem;
+    struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)exec_node->ops_priv;
+    struct argmax_param* argmax_param = (struct argmax_param*)ir_node->op.param_mem;
     argmax_op_param->axis = argmax_param->axis;
     argmax_op_param->keepdims = argmax_param->keepdims;
     argmax_op_param->axis_size = input_tensor->dims[argmax_param->axis];
@@ -174,15 +173,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     void* in_data = input_tensor->data;
     void* out_data = output_tensor->data;
 
-    struct argmax_op_param* argmax_op_param = ( struct argmax_op_param* )exec_node->ops_priv;
+    struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)exec_node->ops_priv;
 
     TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num);
     TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size);
 
     if (input_tensor->data_type == TENGINE_DT_FP32)
-        ref_argmax_fp32(( float* )in_data, (int*)out_data, argmax_op_param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
-        ref_argmax_uint8(( uint8_t* )in_data, (int*)out_data, argmax_op_param);
+        ref_argmax_fp32((float*)in_data, (int*)out_data, argmax_op_param);
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
+        ref_argmax_uint8((uint8_t*)in_data, (int*)out_data, argmax_op_param);
 
     return 0;
 }
diff --git a/source/device/cpu/op/argmin/argmin_ref.c b/source/device/cpu/op/argmin/argmin_ref.c
index 730bd4155..58da946b0 100644
--- a/source/device/cpu/op/argmin/argmin_ref.c
+++ b/source/device/cpu/op/argmin/argmin_ref.c
@@ -36,7 +36,6 @@
 
 #include <stdio.h>
 
-
 struct argmin_op_param
 {
     int axis;
@@ -112,7 +111,7 @@ static int ref_argmin_uint8(uint8_t* input, int* output, const struct argmin_op_
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct argmin_op_param* argmin_op_param = ( struct argmin_op_param* )sys_malloc(sizeof(struct argmin_op_param));
+    struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)sys_malloc(sizeof(struct argmin_op_param));
     argmin_op_param->axis = 0;
     argmin_op_param->axis_size = 1;
     argmin_op_param->inner_size = 1;
@@ -137,8 +136,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct argmin_op_param* argmin_op_param = ( struct argmin_op_param* )exec_node->ops_priv;
-    struct argmin_param* argmin_param = ( struct argmin_param* )ir_node->op.param_mem;
+    struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)exec_node->ops_priv;
+    struct argmin_param* argmin_param = (struct argmin_param*)ir_node->op.param_mem;
     argmin_op_param->axis = argmin_param->axis;
     argmin_op_param->keepdims = argmin_param->keepdims;
     argmin_op_param->axis_size = input_tensor->dims[argmin_param->axis];
@@ -174,15 +173,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     void* in_data = input_tensor->data;
     void* out_data = output_tensor->data;
 
-    struct argmin_op_param* argmin_op_param = ( struct argmin_op_param* )exec_node->ops_priv;
+    struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)exec_node->ops_priv;
 
     TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num);
     TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size);
 
     if (input_tensor->data_type == TENGINE_DT_FP32)
-        ref_argmin_fp32(( float* )in_data, (int*)out_data, argmin_op_param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
-        ref_argmin_uint8(( uint8_t* )in_data, (int*)out_data, argmin_op_param);
+        ref_argmin_fp32((float*)in_data, (int*)out_data, argmin_op_param);
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
+        ref_argmin_uint8((uint8_t*)in_data, (int*)out_data, argmin_op_param);
 
     return 0;
 }
diff --git a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h
index 59f499d14..f655e481f 100644
--- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h
+++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h
@@ -25,7 +25,6 @@
 #ifndef __BATCHNORM_KERNEL_REF_H__
 #define __BATCHNORM_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
diff --git a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c
index b06390e88..de8c49d84 100644
--- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c
+++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_batchnorm_fp32(float* input, float* output, const struct ref_batchnorm_param* param)
 {
     float* scale_mean = param->scale_mean;
diff --git a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c
index e129ae1ce..76d2414fe 100644
--- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c
+++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor, const struct ref_batchnorm_param* param)
 {
     float* scale_mean = param->scale_mean;
@@ -55,9 +54,9 @@ int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tenso
     int32_t input_zero = input_tensor->zero_point;
     int32_t output_zero = output_tensor->zero_point;
 
-    float* data_fp32 = (float*) sys_malloc(total_size * sizeof(float));
-    for(int i = 0; i < total_size; i++)
-        data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale;
+    float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float));
+    for (int i = 0; i < total_size; i++)
+        data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
 
     for (int n = 0; n < param->input_n; ++n)
     {
@@ -87,7 +86,7 @@ int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tenso
     }
 
     // quant
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         int udata = (int)roundf(data_fp32[i] / output_scale + output_zero);
         if (udata > 255)
diff --git a/source/device/cpu/op/batchnorm/batchnorm_ref.c b/source/device/cpu/op/batchnorm/batchnorm_ref.c
index fd8dade7c..25f381310 100644
--- a/source/device/cpu/op/batchnorm/batchnorm_ref.c
+++ b/source/device/cpu/op/batchnorm/batchnorm_ref.c
@@ -38,11 +38,9 @@
 
 #include "batchnorm_kernel_ref.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ref_batchnorm_param* batchnorm_op_param =
-        ( struct ref_batchnorm_param* )sys_malloc(sizeof(struct ref_batchnorm_param));
+    struct ref_batchnorm_param* batchnorm_op_param = (struct ref_batchnorm_param*)sys_malloc(sizeof(struct ref_batchnorm_param));
     memset(batchnorm_op_param, 0, sizeof(struct ref_batchnorm_param));
     exec_node->ops_priv = batchnorm_op_param;
     return 0;
@@ -63,15 +61,15 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* mean_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[3]);
     struct tensor* var_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[4]);
 
-    struct ref_batchnorm_param* op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv;
-    struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )ir_node->op.param_mem;
+    struct ref_batchnorm_param* op_param = (struct ref_batchnorm_param*)exec_node->ops_priv;
+    struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)ir_node->op.param_mem;
 
     int channel_num = input_tensor->dims[1];
 
-    float* scale_mean = ( float* )sys_malloc(channel_num * sizeof(float));
-    float* scale_var_inv = ( float* )sys_malloc(channel_num * sizeof(float));
-    const float* mean = ( const float* )mean_tensor->data;
-    const float* var = ( const float* )var_tensor->data;
+    float* scale_mean = (float*)sys_malloc(channel_num * sizeof(float));
+    float* scale_var_inv = (float*)sys_malloc(channel_num * sizeof(float));
+    const float* mean = (const float*)mean_tensor->data;
+    const float* var = (const float*)var_tensor->data;
 
     float rescale_factor;
     float eps = batchnorm_param->eps;
@@ -81,9 +79,9 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     for (int c = 0; c < channel_num; c++)
     {
         float tmp = sqrtf(var[c] * rescale_factor + eps);
-        scale_var_inv[c] = ( float )(1.f / tmp);
+        scale_var_inv[c] = (float)(1.f / tmp);
         tmp = rescale_factor * scale_var_inv[c];
-        scale_mean[c] = ( float )(-mean[c] * tmp);
+        scale_mean[c] = (float)(-mean[c] * tmp);
     }
     float* gamma = NULL;
     float* beta = NULL;
@@ -91,8 +89,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     {
         const struct tensor* gamma_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
         const struct tensor* beta_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
-        gamma = ( float* )gamma_tensor->data;
-        beta = ( float* )beta_tensor->data;
+        gamma = (float*)gamma_tensor->data;
+        beta = (float*)beta_tensor->data;
     }
     int layout = ir_graph->graph_layout;
     op_param->iscaffe = batchnorm_param->caffe_flavor;
@@ -112,7 +110,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct ref_batchnorm_param* batchnorm_op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv;
+    struct ref_batchnorm_param* batchnorm_op_param = (struct ref_batchnorm_param*)exec_node->ops_priv;
     void* out_data = output_tensor->data;
     void* input = input_tensor->data;
 
@@ -134,7 +132,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     {
         return -1;
     }
-    
+
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_batchnorm_fp32((float*)input, (float*)out_data, batchnorm_op_param);
@@ -146,7 +144,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ref_batchnorm_param* batchnorm_op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv;
+    struct ref_batchnorm_param* batchnorm_op_param = (struct ref_batchnorm_param*)exec_node->ops_priv;
 
     sys_free(batchnorm_op_param->scale_mean);
     sys_free(batchnorm_op_param->scale_var_inv);
diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c
index f455f1f67..359b14ee5 100644
--- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c
+++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c
@@ -35,11 +35,9 @@
 #include <math.h>
 #include <string.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct hcl_batchnorm_param* op_param =
-        ( struct hcl_batchnorm_param* )sys_malloc(sizeof(struct hcl_batchnorm_param));
+    struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)sys_malloc(sizeof(struct hcl_batchnorm_param));
     memset(op_param, 0, sizeof(struct hcl_batchnorm_param));
     exec_node->ops_priv = op_param;
     return 0;
@@ -61,13 +59,13 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 
     int channel_num = mean_tensor->dims[0];
 
-    float* scale_mean = ( float* )sys_malloc(channel_num * sizeof(float));
-    float* scale_var_inv = ( float* )sys_malloc(channel_num * sizeof(float));
+    float* scale_mean = (float*)sys_malloc(channel_num * sizeof(float));
+    float* scale_var_inv = (float*)sys_malloc(channel_num * sizeof(float));
 
-    const float* mean = ( const float* )mean_tensor->data;
-    const float* var = ( const float* )var_tensor->data;
+    const float* mean = (const float*)mean_tensor->data;
+    const float* var = (const float*)var_tensor->data;
 
-    struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )ir_node->op.param_mem;
+    struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)ir_node->op.param_mem;
 
     float rescale_factor;
     float eps = batchnorm_param->eps;
@@ -76,16 +74,16 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     for (int c = 0; c < channel_num; c++)
     {
         float tmp = sqrt(var[c] * rescale_factor + eps);
-        scale_var_inv[c] = ( float )(1.f / tmp);
+        scale_var_inv[c] = (float)(1.f / tmp);
         tmp = rescale_factor * scale_var_inv[c];
-        scale_mean[c] = ( float )(-mean[c] * tmp);
+        scale_mean[c] = (float)(-mean[c] * tmp);
     }
     if (!batchnorm_param->caffe_flavor)
     {
         const struct tensor* gamma_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
         const struct tensor* beta_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
-        const float* gamma = ( const float* )gamma_tensor->data;
-        const float* beta = ( const float* )beta_tensor->data;
+        const float* gamma = (const float*)gamma_tensor->data;
+        const float* beta = (const float*)beta_tensor->data;
         for (int c = 0; c < channel_num; c++)
         {
             scale_var_inv[c] *= gamma[c];
@@ -94,7 +92,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
         }
     }
 
-    struct hcl_batchnorm_param* op_param = ( struct hcl_batchnorm_param* )exec_node->ops_priv;
+    struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)exec_node->ops_priv;
     op_param->scale_mean = scale_mean;
     op_param->scale_var_inv = scale_var_inv;
 
@@ -111,7 +109,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct hcl_batchnorm_param* op_param = ( struct hcl_batchnorm_param* )exec_node->ops_priv;
+    struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)exec_node->ops_priv;
     float* scale_mean = op_param->scale_mean;
     float* scale_var_inv = op_param->scale_var_inv;
     int num_thread = exec_graph->num_thread;
@@ -123,7 +121,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct hcl_batchnorm_param* op_param = ( struct hcl_batchnorm_param* )exec_node->ops_priv;
+    struct hcl_batchnorm_param* op_param = (struct hcl_batchnorm_param*)exec_node->ops_priv;
     sys_free(op_param->scale_mean);
     sys_free(op_param->scale_var_inv);
 
diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c
index cd9a5835c..181648a08 100644
--- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c
+++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.c
@@ -26,11 +26,10 @@
 
 #include <arm_neon.h>
 
-
 static void batchnorm_kernel(int i, int id, void* data, const float* input, float* output, float* scale_mean,
                              float* scale_var, int channel_size, int num_thread)
 {
-    int step = (( int* )data)[0];
+    int step = ((int*)data)[0];
 
 #pragma omp parallel for num_threads(num_thread)
     for (int c = 0; c < step; c++)
@@ -68,11 +67,11 @@ int batchnorm_run(struct tensor* output_tensor, struct tensor* input_tensor, flo
     int channel_size = (input_tensor->dims[2]) * (input_tensor->dims[3]);
     int img_size = channel_num * channel_size;
 
-    const float* input = ( const float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
+    const float* input = (const float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
 
-    float* scale_mean_t = ( float* )scale_mean;
-    float* scale_var_inv_t = ( float* )scale_var_inv;
+    float* scale_mean_t = (float*)scale_mean;
+    float* scale_var_inv_t = (float*)scale_var_inv;
 
     /* only use mean and var */
     for (int i = 0; i < batch_number; i++)
diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h
index 9bff1df8b..58a3d3507 100644
--- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h
+++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_kernel_arm.h
@@ -29,7 +29,6 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 struct hcl_batchnorm_param
 {
     float* scale_mean;
diff --git a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c
index 770cabb76..9c9aa6044 100644
--- a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c
+++ b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c
@@ -36,7 +36,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static int ref_batchtospacend_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
                                    struct batchtospacend_param* param, int num_thread)
 {
@@ -62,25 +61,21 @@ static int ref_batchtospacend_fp32(struct tensor* input_tensor, struct tensor* o
         const int spatial_offset = (int)roundf(in_batch / out_dims[0]);
         for (int in_h = 0; in_h < in_dims[1]; ++in_h)
         {
-            const int out_h =
-                    (int)roundf(in_h * (param->dilation_y) + spatial_offset / (param->dilation_x) - param->crop_top);
+            const int out_h = (int)roundf(in_h * (param->dilation_y) + spatial_offset / (param->dilation_x) - param->crop_top);
 
             if (out_h < 0 || out_h >= out_dims[1])
                 continue;
 
             for (int in_w = 0; in_w < in_dims[2]; ++in_w)
             {
-                const int out_w =
-                        (int)roundf(in_w * param->dilation_x + spatial_offset % param->dilation_x - param->crop_left);
+                const int out_w = (int)roundf(in_w * param->dilation_x + spatial_offset % param->dilation_x - param->crop_left);
 
                 if (out_w < 0 || out_w >= out_dims[2])
                     continue;
 
-                int outOffset = (int)roundf(out_batch * out_dims[1] * out_dims[2] * out_dims[3] +
-                                      out_h * out_dims[2] * out_dims[3] + out_w * in_dims[3]);
+                int outOffset = (int)roundf(out_batch * out_dims[1] * out_dims[2] * out_dims[3] + out_h * out_dims[2] * out_dims[3] + out_w * in_dims[3]);
                 float* out = out_data + outOffset;
-                int inOffset = (int)roundf(in_batch * in_dims[1] * in_dims[2] * in_dims[3] + in_h * in_dims[2] * in_dims[3] +
-                                     in_w * in_dims[3]);
+                int inOffset = (int)roundf(in_batch * in_dims[1] * in_dims[2] * in_dims[3] + in_h * in_dims[2] * in_dims[3] + in_w * in_dims[3]);
                 const float* in = in_data + inOffset;
                 memcpy(out, in, in_dims[3] * sizeof(float));
             }
@@ -109,7 +104,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct batchtospacend_param* batchtospacend_param = ( struct batchtospacend_param* )ir_node->op.param_mem;
+    struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)ir_node->op.param_mem;
 
     ref_batchtospacend_fp32(input_tensor, output_tensor, batchtospacend_param, exec_graph->num_thread);
 
diff --git a/source/device/cpu/op/bias/bias_ref.c b/source/device/cpu/op/bias/bias_ref.c
index c9653295c..2eb39c085 100644
--- a/source/device/cpu/op/bias/bias_ref.c
+++ b/source/device/cpu/op/bias/bias_ref.c
@@ -33,7 +33,6 @@
 
 #include <math.h>
 
-
 int ref_bias_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* bias_tensor,
                   int num_thread)
 {
diff --git a/source/device/cpu/op/broadmul/broadmul_ref.c b/source/device/cpu/op/broadmul/broadmul_ref.c
index 7d91e9f01..92ed72a28 100644
--- a/source/device/cpu/op/broadmul/broadmul_ref.c
+++ b/source/device/cpu/op/broadmul/broadmul_ref.c
@@ -33,7 +33,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -151,4 +150,3 @@ int unregister_broadmul_ref_op()
 {
     return unregister_builtin_node_ops(OP_BROADMUL, &hcl_node_ops);
 }
-
diff --git a/source/device/cpu/op/cast/cast_ref.c b/source/device/cpu/op/cast/cast_ref.c
index 8fcb58772..9eb88fb16 100644
--- a/source/device/cpu/op/cast/cast_ref.c
+++ b/source/device/cpu/op/cast/cast_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -69,7 +68,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     int num_thread = exec_graph->num_thread;
 
-
     if (input_tensor->elem_num != output_tensor->elem_num || input_tensor->dim_num != output_tensor->dim_num)
     {
         return -1;
diff --git a/source/device/cpu/op/ceil/ceil_ref.c b/source/device/cpu/op/ceil/ceil_ref.c
index 0602ec448..95cc44f39 100644
--- a/source/device/cpu/op/ceil/ceil_ref.c
+++ b/source/device/cpu/op/ceil/ceil_ref.c
@@ -35,7 +35,6 @@
 
 #include <math.h>
 
-
 int ref_ceil_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     // dims size = 2 or 3
@@ -84,7 +83,7 @@ int ref_ceil_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int
 
 int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
-     /* dequant */
+    /* dequant */
     uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
     uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
     float input_scale = input_tensor->scale;
@@ -94,12 +93,12 @@ int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, in
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input_data = ( float* )sys_malloc(input_size * sizeof(float));
-    float* out_data = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input_data = (float*)sys_malloc(input_size * sizeof(float));
+    float* out_data = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     // dims size = 2 or 3
@@ -180,7 +179,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_ceil_fp32(input_tensor, output_tensor, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_ceil_uint8(input_tensor, output_tensor, exec_graph->num_thread);
     else
         TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
diff --git a/source/device/cpu/op/clip/clip_kernel_ref.h b/source/device/cpu/op/clip/clip_kernel_ref.h
index 9a2898884..efdd67877 100644
--- a/source/device/cpu/op/clip/clip_kernel_ref.h
+++ b/source/device/cpu/op/clip/clip_kernel_ref.h
@@ -25,12 +25,10 @@
 #ifndef __CLIP_KERNEL_REF_H__
 #define __CLIP_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int ref_clip_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min);
 
 int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min);
diff --git a/source/device/cpu/op/clip/clip_kernel_ref_fp32.c b/source/device/cpu/op/clip/clip_kernel_ref_fp32.c
index 53f688e40..ba2b46bad 100644
--- a/source/device/cpu/op/clip/clip_kernel_ref_fp32.c
+++ b/source/device/cpu/op/clip/clip_kernel_ref_fp32.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_clip_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min)
 {
     int total_size = input_tensor->elem_num;
diff --git a/source/device/cpu/op/clip/clip_kernel_ref_uint8.c b/source/device/cpu/op/clip/clip_kernel_ref_uint8.c
index 064954335..c7f33a538 100644
--- a/source/device/cpu/op/clip/clip_kernel_ref_uint8.c
+++ b/source/device/cpu/op/clip/clip_kernel_ref_uint8.c
@@ -38,12 +38,11 @@
 
 #include <math.h>
 
-
 int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, float max, float min)
 {
     int total_size = input_tensor->elem_num;
-    uint8_t* input_uint8 = ( uint8_t* )input_tensor->data;
-    uint8_t* output_uint8 = ( uint8_t* )output_tensor->data;
+    uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
+    uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
 
     float input_scale = input_tensor->scale;
     float output_scale = output_tensor->scale;
@@ -51,11 +50,11 @@ int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, fl
     int output_zero = output_tensor->zero_point;
 
     /* input dequant */
-    float* input_fp32 = ( float* )sys_malloc(total_size * sizeof(float));
-    float* output_fp32 = ( float* )sys_malloc(total_size * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(total_size * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(total_size * sizeof(float));
 
     for (uint32_t i = 0; i < input_tensor->elem_num; i++)
-        input_fp32[i] = ((float )input_uint8[i] - (float )input_zero) * input_scale;
+        input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
 
     for (int i = 0; i < total_size; i++)
     {
@@ -75,7 +74,7 @@ int ref_clip_uint8(struct tensor* input_tensor, struct tensor* output_tensor, fl
     }
 
     sys_free(input_fp32);
-    sys_free(output_fp32); 
+    sys_free(output_fp32);
 
     return 0;
 }
diff --git a/source/device/cpu/op/clip/clip_ref.c b/source/device/cpu/op/clip/clip_ref.c
index 09c6c1c41..b29962c19 100644
--- a/source/device/cpu/op/clip/clip_ref.c
+++ b/source/device/cpu/op/clip/clip_ref.c
@@ -36,7 +36,6 @@
 
 #include "clip_kernel_ref.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct clip_param* clip_param = ( struct clip_param* )ir_node->op.param_mem;
+    struct clip_param* clip_param = (struct clip_param*)ir_node->op.param_mem;
 
     float max = clip_param->max;
     float min = clip_param->min;
diff --git a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c
index 47a5c25f0..bfa3e4b70 100644
--- a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c
+++ b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c
@@ -24,7 +24,6 @@
 
 #include "comparison_kernel_ref.h"
 
-
 void comp_equal(int input_hw, int input_hw_1, int input_count4, int input1_count4, float* input0, float* input1,
                 p_comparison_param param, float* output)
 {
@@ -418,33 +417,39 @@ int ref_comparison_fp32(float* input0, float* input1, float* output, p_compariso
 
     switch (param->type)
     {
-        case 0: {
-            comp_equal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
-            break;
-        }
-        case 1: {
-            comp_nequal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
-            break;
-        }
-        case 2: {
-            comp_greater(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
-            break;
-        }
-        case 3: {
-            comp_greatere(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
-            break;
-        }
-        case 4: {
-            comp_less(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
-            break;
-        }
-        case 5: {
-            comp_lesse(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
-            break;
-        }
-        default:
-            return -1;
-            break;
+    case 0:
+    {
+        comp_equal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
+        break;
+    }
+    case 1:
+    {
+        comp_nequal(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
+        break;
+    }
+    case 2:
+    {
+        comp_greater(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
+        break;
+    }
+    case 3:
+    {
+        comp_greatere(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
+        break;
+    }
+    case 4:
+    {
+        comp_less(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
+        break;
+    }
+    case 5:
+    {
+        comp_lesse(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, param, output);
+        break;
+    }
+    default:
+        return -1;
+        break;
     }
     return 0;
 }
diff --git a/source/device/cpu/op/comparison/comparison_ref.c b/source/device/cpu/op/comparison/comparison_ref.c
index b583e7252..14405732c 100644
--- a/source/device/cpu/op/comparison/comparison_ref.c
+++ b/source/device/cpu/op/comparison/comparison_ref.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -63,7 +62,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor1 = get_ir_graph_tensor(graph, node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct comparison_param* param = ( struct comparison_param* )node->op.param_mem;
+    struct comparison_param* param = (struct comparison_param*)node->op.param_mem;
 
     void* input0 = input_tensor->data;
     void* input1 = input_tensor1->data;
diff --git a/source/device/cpu/op/concat/concat_kernel_ref.h b/source/device/cpu/op/concat/concat_kernel_ref.h
index 2e0c71a2b..d078e1bc0 100644
--- a/source/device/cpu/op/concat/concat_kernel_ref.h
+++ b/source/device/cpu/op/concat/concat_kernel_ref.h
@@ -25,12 +25,10 @@
 #ifndef __CONCAT_KERNEL_REF_H__
 #define __CONCAT_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis);
 
 int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis);
diff --git a/source/device/cpu/op/concat/concat_kernel_ref_fp32.c b/source/device/cpu/op/concat/concat_kernel_ref_fp32.c
index 1d220cd18..d36733ea7 100644
--- a/source/device/cpu/op/concat/concat_kernel_ref_fp32.c
+++ b/source/device/cpu/op/concat/concat_kernel_ref_fp32.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
 {
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
@@ -45,10 +44,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
     {
         struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
 
-        float* input_data  = (float*)input_tensor->data;
+        float* input_data = (float*)input_tensor->data;
         float* output_data = (float*)output_tensor->data;
 
-        for(int i=0; i<input_tensor->elem_num; i++)
+        for (int i = 0; i < input_tensor->elem_num; i++)
             output_data[i] = input_data[i];
 
         return 0;
@@ -67,10 +66,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
 
             int size = input_tensor->elem_num;
 
-            float* input_data  = (float*)input_tensor->data;
+            float* input_data = (float*)input_tensor->data;
             float* output_data = (float*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
                 output_data[i] = input_data[i];
             }
@@ -85,14 +84,14 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             int size = input_tensor->elem_num;
 
-            float* input_data  = (float*)input_tensor->data;
+            float* input_data = (float*)input_tensor->data;
             float* output_data = (float*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
                 output_data[i] = input_data[i];
             }
@@ -106,7 +105,7 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_n = output_tensor->dims[0];
         int out_w = output_tensor->dims[1];
 
-        for (int n=0; n<output_tensor->dims[0]; n++)
+        for (int n = 0; n < output_tensor->dims[0]; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -116,10 +115,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
                 int in_n = input_tensor->dims[0];
                 int in_w = input_tensor->dims[1];
 
-                float* input_data  = (float*)input_tensor->data  + n * in_w;
+                float* input_data = (float*)input_tensor->data + n * in_w;
                 float* output_data = (float*)output_tensor->data + n * out_w + output_step;
 
-                for (int i=0; i<in_w; i++)
+                for (int i = 0; i < in_w; i++)
                 {
                     output_data[i] = input_data[i];
                 }
@@ -135,14 +134,14 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             int size = input_tensor->elem_num;
 
-            float* input_data  = (float*)input_tensor->data;
+            float* input_data = (float*)input_tensor->data;
             float* output_data = (float*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
                 output_data[i] = input_data[i];
             }
@@ -158,7 +157,7 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_w = output_tensor->dims[2];
         int out_nstep = out_h * out_w;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -170,10 +169,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
                 int in_w = input_tensor->dims[2];
                 int in_nstep = in_h * in_w;
 
-                float* input_data  = (float*)input_tensor->data  + n * in_nstep;
+                float* input_data = (float*)input_tensor->data + n * in_nstep;
                 float* output_data = (float*)output_tensor->data + n * out_nstep + output_step;
 
-                for (int i=0; i<in_nstep; i++)
+                for (int i = 0; i < in_nstep; i++)
                 {
                     output_data[i] = input_data[i];
                 }
@@ -190,9 +189,9 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_w = output_tensor->dims[2];
         int out_nstep = out_h * out_w;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int h=0; h<out_h; h++)
+            for (int h = 0; h < out_h; h++)
             {
                 int output_step = 0;
                 for (int num = 0; num < ir_node->input_num; num++)
@@ -204,10 +203,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
                     int in_w = input_tensor->dims[2];
                     int in_nstep = in_h * in_w;
 
-                    float* input_data  = (float*)input_tensor->data  + n * in_nstep  + h * in_w;
+                    float* input_data = (float*)input_tensor->data + n * in_nstep + h * in_w;
                     float* output_data = (float*)output_tensor->data + n * out_nstep + h * out_w + output_step;
 
-                    for (int i=0; i<in_w; i++)
+                    for (int i = 0; i < in_w; i++)
                     {
                         output_data[i] = input_data[i];
                     }
@@ -224,14 +223,14 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             int size = input_tensor->elem_num;
 
-            float* input_data  = (float*)input_tensor->data;
+            float* input_data = (float*)input_tensor->data;
             float* output_data = (float*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
                 output_data[i] = input_data[i];
             }
@@ -249,7 +248,7 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -263,10 +262,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
                 int in_cstep = in_h * in_w;
                 int in_nstep = in_c * in_cstep;
 
-                float* input_data  = (float*)input_tensor->data  + n * in_nstep;
+                float* input_data = (float*)input_tensor->data + n * in_nstep;
                 float* output_data = (float*)output_tensor->data + n * out_nstep + output_step;
 
-                for (int i=0; i<in_nstep; i++)
+                for (int i = 0; i < in_nstep; i++)
                 {
                     output_data[i] = input_data[i];
                 }
@@ -285,9 +284,9 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int c=0; c<out_c; c++)
+            for (int c = 0; c < out_c; c++)
             {
                 int output_step = 0;
                 for (int num = 0; num < ir_node->input_num; num++)
@@ -301,10 +300,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
                     int in_cstep = in_h * in_w;
                     int in_nstep = in_c * in_cstep;
 
-                    float* input_data  = (float*)input_tensor->data  + n * in_nstep  + c * in_cstep;
+                    float* input_data = (float*)input_tensor->data + n * in_nstep + c * in_cstep;
                     float* output_data = (float*)output_tensor->data + n * out_nstep + c * out_cstep + output_step;
 
-                    for (int i=0; i<in_cstep; i++)
+                    for (int i = 0; i < in_cstep; i++)
                     {
                         output_data[i] = input_data[i];
                     }
@@ -324,11 +323,11 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int c=0; c<out_c; c++)
+            for (int c = 0; c < out_c; c++)
             {
-                for (int h=0; h<out_h; h++)
+                for (int h = 0; h < out_h; h++)
                 {
                     int output_step = 0;
                     for (int num = 0; num < ir_node->input_num; num++)
@@ -342,10 +341,10 @@ int ref_concat_fp32(struct graph* ir_graph, struct node* ir_node, int axis)
                         int in_cstep = in_h * in_w;
                         int in_nstep = in_c * in_cstep;
 
-                        float* input_data  = (float*)input_tensor->data  + n * in_nstep  + c * in_cstep  + h * in_w;
+                        float* input_data = (float*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w;
                         float* output_data = (float*)output_tensor->data + n * out_nstep + c * out_cstep + h * out_w + output_step;
 
-                        for (int i=0; i<in_w; i++)
+                        for (int i = 0; i < in_w; i++)
                         {
                             output_data[i] = input_data[i];
                         }
diff --git a/source/device/cpu/op/concat/concat_kernel_ref_int8.c b/source/device/cpu/op/concat/concat_kernel_ref_int8.c
index 85ce450a3..d5b265847 100644
--- a/source/device/cpu/op/concat/concat_kernel_ref_int8.c
+++ b/source/device/cpu/op/concat/concat_kernel_ref_int8.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
 {
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
@@ -48,10 +47,10 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
     {
         struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
 
-        int8_t* input_data  = (int8_t*)input_tensor->data;
+        int8_t* input_data = (int8_t*)input_tensor->data;
         int8_t* output_data = (int8_t*)output_tensor->data;
 
-        for(int i=0; i<input_tensor->elem_num; i++)
+        for (int i = 0; i < input_tensor->elem_num; i++)
             output_data[i] = input_data[i];
 
         return 0;
@@ -73,12 +72,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
 
             int size = input_tensor->elem_num;
 
-            int8_t* input_data  = (int8_t*)input_tensor->data;
+            int8_t* input_data = (int8_t*)input_tensor->data;
             int8_t* output_data = (int8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int idata = roundf(input_data[i] *  rescale);
+                int idata = roundf(input_data[i] * rescale);
                 if (idata > 127)
                     idata = 127;
                 else if (idata < -127)
@@ -96,19 +95,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             float intput_scale = input_tensor->scale;
             float rescale = intput_scale / output_scale;
 
             int size = input_tensor->elem_num;
 
-            int8_t* input_data  = (int8_t*)input_tensor->data;
+            int8_t* input_data = (int8_t*)input_tensor->data;
             int8_t* output_data = (int8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int idata = roundf(input_data[i] *  rescale);
+                int idata = roundf(input_data[i] * rescale);
                 if (idata > 127)
                     idata = 127;
                 else if (idata < -127)
@@ -125,7 +124,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_n = output_tensor->dims[0];
         int out_w = output_tensor->dims[1];
 
-        for (int n=0; n<output_tensor->dims[0]; n++)
+        for (int n = 0; n < output_tensor->dims[0]; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -133,17 +132,17 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                 struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
                 float intput_scale = input_tensor->scale;
-                float rescale = intput_scale / output_scale;                
+                float rescale = intput_scale / output_scale;
 
                 int in_n = input_tensor->dims[0];
                 int in_w = input_tensor->dims[1];
 
-                int8_t* input_data  = (int8_t*)input_tensor->data  + n * in_w;
+                int8_t* input_data = (int8_t*)input_tensor->data + n * in_w;
                 int8_t* output_data = (int8_t*)output_tensor->data + n * out_w + output_step;
 
-                for (int i=0; i<in_w; i++)
+                for (int i = 0; i < in_w; i++)
                 {
-                    int idata = roundf(input_data[i] *  rescale);
+                    int idata = roundf(input_data[i] * rescale);
                     if (idata > 127)
                         idata = 127;
                     else if (idata < -127)
@@ -162,19 +161,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             float intput_scale = input_tensor->scale;
-            float rescale = intput_scale / output_scale;            
+            float rescale = intput_scale / output_scale;
 
             int size = input_tensor->elem_num;
 
-            int8_t* input_data  = (int8_t*)input_tensor->data;
+            int8_t* input_data = (int8_t*)input_tensor->data;
             int8_t* output_data = (int8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int idata = roundf(input_data[i] *  rescale);
+                int idata = roundf(input_data[i] * rescale);
                 if (idata > 127)
                     idata = 127;
                 else if (idata < -127)
@@ -193,7 +192,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_w = output_tensor->dims[2];
         int out_nstep = out_h * out_w;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -201,19 +200,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                 struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
                 float intput_scale = input_tensor->scale;
-                float rescale = intput_scale / output_scale;                
+                float rescale = intput_scale / output_scale;
 
                 int in_n = input_tensor->dims[0];
                 int in_h = input_tensor->dims[1];
                 int in_w = input_tensor->dims[2];
                 int in_nstep = in_h * in_w;
 
-                int8_t* input_data  = (int8_t*)input_tensor->data  + n * in_nstep;
+                int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep;
                 int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + output_step;
 
-                for (int i=0; i<in_nstep; i++)
+                for (int i = 0; i < in_nstep; i++)
                 {
-                    int idata = roundf(input_data[i] *  rescale);
+                    int idata = roundf(input_data[i] * rescale);
                     if (idata > 127)
                         idata = 127;
                     else if (idata < -127)
@@ -233,9 +232,9 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_w = output_tensor->dims[2];
         int out_nstep = out_h * out_w;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int h=0; h<out_h; h++)
+            for (int h = 0; h < out_h; h++)
             {
                 int output_step = 0;
                 for (int num = 0; num < ir_node->input_num; num++)
@@ -243,19 +242,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
                     float intput_scale = input_tensor->scale;
-                    float rescale = intput_scale / output_scale;                    
+                    float rescale = intput_scale / output_scale;
 
                     int in_n = input_tensor->dims[0];
                     int in_h = input_tensor->dims[1];
                     int in_w = input_tensor->dims[2];
                     int in_nstep = in_h * in_w;
 
-                    int8_t* input_data  = (int8_t*)input_tensor->data  + n * in_nstep  + h * in_w;
+                    int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + h * in_w;
                     int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + h * out_w + output_step;
 
-                    for (int i=0; i<in_w; i++)
+                    for (int i = 0; i < in_w; i++)
                     {
-                        int idata = roundf(input_data[i] *  rescale);
+                        int idata = roundf(input_data[i] * rescale);
                         if (idata > 127)
                             idata = 127;
                         else if (idata < -127)
@@ -275,19 +274,19 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             float intput_scale = input_tensor->scale;
-            float rescale = intput_scale / output_scale;            
+            float rescale = intput_scale / output_scale;
 
             int size = input_tensor->elem_num;
 
-            int8_t* input_data  = (int8_t*)input_tensor->data;
+            int8_t* input_data = (int8_t*)input_tensor->data;
             int8_t* output_data = (int8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int idata = roundf(input_data[i] *  rescale);
+                int idata = roundf(input_data[i] * rescale);
                 if (idata > 127)
                     idata = 127;
                 else if (idata < -127)
@@ -308,7 +307,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -316,7 +315,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                 struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
                 float intput_scale = input_tensor->scale;
-                float rescale = intput_scale / output_scale;                
+                float rescale = intput_scale / output_scale;
 
                 int in_n = input_tensor->dims[0];
                 int in_c = input_tensor->dims[1];
@@ -325,12 +324,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                 int in_cstep = in_h * in_w;
                 int in_nstep = in_c * in_cstep;
 
-                int8_t* input_data  = (int8_t*)input_tensor->data  + n * in_nstep;
+                int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep;
                 int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + output_step;
 
-                for (int i=0; i<in_nstep; i++)
+                for (int i = 0; i < in_nstep; i++)
                 {
-                    int idata = roundf(input_data[i] *  rescale);
+                    int idata = roundf(input_data[i] * rescale);
                     if (idata > 127)
                         idata = 127;
                     else if (idata < -127)
@@ -352,9 +351,9 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int c=0; c<out_c; c++)
+            for (int c = 0; c < out_c; c++)
             {
                 int output_step = 0;
                 for (int num = 0; num < ir_node->input_num; num++)
@@ -362,7 +361,7 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
                     float intput_scale = input_tensor->scale;
-                    float rescale = intput_scale / output_scale;                    
+                    float rescale = intput_scale / output_scale;
 
                     int in_n = input_tensor->dims[0];
                     int in_c = input_tensor->dims[1];
@@ -371,12 +370,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                     int in_cstep = in_h * in_w;
                     int in_nstep = in_c * in_cstep;
 
-                    int8_t* input_data  = (int8_t*)input_tensor->data  + n * in_nstep  + c * in_cstep;
+                    int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + c * in_cstep;
                     int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + c * out_cstep + output_step;
 
-                    for (int i=0; i<in_cstep; i++)
+                    for (int i = 0; i < in_cstep; i++)
                     {
-                        int idata = roundf(input_data[i] *  rescale);
+                        int idata = roundf(input_data[i] * rescale);
                         if (idata > 127)
                             idata = 127;
                         else if (idata < -127)
@@ -399,11 +398,11 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int c=0; c<out_c; c++)
+            for (int c = 0; c < out_c; c++)
             {
-                for (int h=0; h<out_h; h++)
+                for (int h = 0; h < out_h; h++)
                 {
                     int output_step = 0;
                     for (int num = 0; num < ir_node->input_num; num++)
@@ -420,12 +419,12 @@ int ref_concat_int8(struct graph* ir_graph, struct node* ir_node, int axis)
                         int in_cstep = in_h * in_w;
                         int in_nstep = in_c * in_cstep;
 
-                        int8_t* input_data  = (int8_t*)input_tensor->data  + n * in_nstep  + c * in_cstep  + h * in_w;
+                        int8_t* input_data = (int8_t*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w;
                         int8_t* output_data = (int8_t*)output_tensor->data + n * out_nstep + c * out_cstep + h * out_w + output_step;
 
-                        for (int i=0; i<in_w; i++)
+                        for (int i = 0; i < in_w; i++)
                         {
-                            int idata = roundf(input_data[i] *  rescale);
+                            int idata = roundf(input_data[i] * rescale);
                             if (idata > 127)
                                 idata = 127;
                             else if (idata < -127)
diff --git a/source/device/cpu/op/concat/concat_kernel_ref_uint8.c b/source/device/cpu/op/concat/concat_kernel_ref_uint8.c
index 80c1b1599..68f13601d 100644
--- a/source/device/cpu/op/concat/concat_kernel_ref_uint8.c
+++ b/source/device/cpu/op/concat/concat_kernel_ref_uint8.c
@@ -38,23 +38,22 @@
 
 #include <math.h>
 
-
 int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 {
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
     float output_scale = output_tensor->scale;
-    int output_zero = output_tensor->zero_point;  
+    int output_zero = output_tensor->zero_point;
 
     if (ir_node->input_num == 1)
     {
         struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
 
-        uint8_t* input_data  = (uint8_t*)input_tensor->data;
+        uint8_t* input_data = (uint8_t*)input_tensor->data;
         uint8_t* output_data = (uint8_t*)output_tensor->data;
 
-        for(int i=0; i<input_tensor->elem_num; i++)
+        for (int i = 0; i < input_tensor->elem_num; i++)
             output_data[i] = input_data[i];
-            
+
         return 0;
     }
 
@@ -75,12 +74,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
             int size = input_tensor->elem_num;
 
-            uint8_t* input_data  = (uint8_t*)input_tensor->data;
+            uint8_t* input_data = (uint8_t*)input_tensor->data;
             uint8_t* output_data = (uint8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -98,7 +97,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             float intput_scale = input_tensor->scale;
             int intput_zero = input_tensor->zero_point;
@@ -106,12 +105,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
             int size = input_tensor->elem_num;
 
-            uint8_t* input_data  = (uint8_t*)input_tensor->data;
+            uint8_t* input_data = (uint8_t*)input_tensor->data;
             uint8_t* output_data = (uint8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -128,7 +127,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_n = output_tensor->dims[0];
         int out_w = output_tensor->dims[1];
 
-        for (int n=0; n<output_tensor->dims[0]; n++)
+        for (int n = 0; n < output_tensor->dims[0]; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -137,17 +136,17 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
                 float intput_scale = input_tensor->scale;
                 int intput_zero = input_tensor->zero_point;
-                float rescale = intput_scale / output_scale;                
+                float rescale = intput_scale / output_scale;
 
                 int in_n = input_tensor->dims[0];
                 int in_w = input_tensor->dims[1];
 
-                uint8_t* input_data  = (uint8_t*)input_tensor->data  + n * in_w;
+                uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_w;
                 uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_w + output_step;
 
-                for (int i=0; i<in_w; i++)
+                for (int i = 0; i < in_w; i++)
                 {
-                    int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                    int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                     if (udata > 255)
                         udata = 255;
                     else if (udata < 0)
@@ -166,20 +165,20 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             float intput_scale = input_tensor->scale;
             int intput_zero = input_tensor->zero_point;
-            float rescale = intput_scale / output_scale;            
+            float rescale = intput_scale / output_scale;
 
             int size = input_tensor->elem_num;
 
-            uint8_t* input_data  = (uint8_t*)input_tensor->data;
+            uint8_t* input_data = (uint8_t*)input_tensor->data;
             uint8_t* output_data = (uint8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -198,7 +197,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_w = output_tensor->dims[2];
         int out_nstep = out_h * out_w;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -207,19 +206,19 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
                 float intput_scale = input_tensor->scale;
                 int intput_zero = input_tensor->zero_point;
-                float rescale = intput_scale / output_scale;                
+                float rescale = intput_scale / output_scale;
 
                 int in_n = input_tensor->dims[0];
                 int in_h = input_tensor->dims[1];
                 int in_w = input_tensor->dims[2];
                 int in_nstep = in_h * in_w;
 
-                uint8_t* input_data  = (uint8_t*)input_tensor->data  + n * in_nstep;
+                uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep;
                 uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + output_step;
 
-                for (int i=0; i<in_nstep; i++)
+                for (int i = 0; i < in_nstep; i++)
                 {
-                    int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                    int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                     if (udata > 255)
                         udata = 255;
                     else if (udata < 0)
@@ -239,9 +238,9 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_w = output_tensor->dims[2];
         int out_nstep = out_h * out_w;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int h=0; h<out_h; h++)
+            for (int h = 0; h < out_h; h++)
             {
                 int output_step = 0;
                 for (int num = 0; num < ir_node->input_num; num++)
@@ -250,19 +249,19 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
                     float intput_scale = input_tensor->scale;
                     int intput_zero = input_tensor->zero_point;
-                    float rescale = intput_scale / output_scale;                    
+                    float rescale = intput_scale / output_scale;
 
                     int in_n = input_tensor->dims[0];
                     int in_h = input_tensor->dims[1];
                     int in_w = input_tensor->dims[2];
                     int in_nstep = in_h * in_w;
 
-                    uint8_t* input_data  = (uint8_t*)input_tensor->data  + n * in_nstep  + h * in_w;
+                    uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + h * in_w;
                     uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + h * out_w + output_step;
 
-                    for (int i=0; i<in_w; i++)
+                    for (int i = 0; i < in_w; i++)
                     {
-                        int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                        int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                         if (udata > 255)
                             udata = 255;
                         else if (udata < 0)
@@ -282,20 +281,20 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int output_step = 0;
         for (int num = 0; num < ir_node->input_num; num++)
         {
-            struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
+            struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[num]);
 
             float intput_scale = input_tensor->scale;
             int intput_zero = input_tensor->zero_point;
-            float rescale = intput_scale / output_scale;            
+            float rescale = intput_scale / output_scale;
 
             int size = input_tensor->elem_num;
 
-            uint8_t* input_data  = (uint8_t*)input_tensor->data;
+            uint8_t* input_data = (uint8_t*)input_tensor->data;
             uint8_t* output_data = (uint8_t*)output_tensor->data + output_step;
 
-            for (int i=0; i<size; i++)
+            for (int i = 0; i < size; i++)
             {
-                int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                 if (udata > 255)
                     udata = 255;
                 else if (udata < 0)
@@ -316,7 +315,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
             int output_step = 0;
             for (int num = 0; num < ir_node->input_num; num++)
@@ -325,7 +324,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
                 float intput_scale = input_tensor->scale;
                 int intput_zero = input_tensor->zero_point;
-                float rescale = intput_scale / output_scale;                
+                float rescale = intput_scale / output_scale;
 
                 int in_n = input_tensor->dims[0];
                 int in_c = input_tensor->dims[1];
@@ -334,12 +333,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
                 int in_cstep = in_h * in_w;
                 int in_nstep = in_c * in_cstep;
 
-                uint8_t* input_data  = (uint8_t*)input_tensor->data  + n * in_nstep;
+                uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep;
                 uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + output_step;
 
-                for (int i=0; i<in_nstep; i++)
+                for (int i = 0; i < in_nstep; i++)
                 {
-                    int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                    int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                     if (udata > 255)
                         udata = 255;
                     else if (udata < 0)
@@ -361,9 +360,9 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int c=0; c<out_c; c++)
+            for (int c = 0; c < out_c; c++)
             {
                 int output_step = 0;
                 for (int num = 0; num < ir_node->input_num; num++)
@@ -372,7 +371,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
                     float intput_scale = input_tensor->scale;
                     int intput_zero = input_tensor->zero_point;
-                    float rescale = intput_scale / output_scale;                    
+                    float rescale = intput_scale / output_scale;
 
                     int in_n = input_tensor->dims[0];
                     int in_c = input_tensor->dims[1];
@@ -381,12 +380,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
                     int in_cstep = in_h * in_w;
                     int in_nstep = in_c * in_cstep;
 
-                    uint8_t* input_data  = (uint8_t*)input_tensor->data  + n * in_nstep  + c * in_cstep;
+                    uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + c * in_cstep;
                     uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + c * out_cstep + output_step;
 
-                    for (int i=0; i<in_cstep; i++)
+                    for (int i = 0; i < in_cstep; i++)
                     {
-                        int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                        int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                         if (udata > 255)
                             udata = 255;
                         else if (udata < 0)
@@ -409,11 +408,11 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
         int out_cstep = out_h * out_w;
         int out_nstep = out_c * out_cstep;
 
-        for (int n=0; n<out_n; n++)
+        for (int n = 0; n < out_n; n++)
         {
-            for (int c=0; c<out_c; c++)
+            for (int c = 0; c < out_c; c++)
             {
-                for (int h=0; h<out_h; h++)
+                for (int h = 0; h < out_h; h++)
                 {
                     int output_step = 0;
                     for (int num = 0; num < ir_node->input_num; num++)
@@ -422,7 +421,7 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
 
                         float intput_scale = input_tensor->scale;
                         int intput_zero = input_tensor->zero_point;
-                        float rescale = intput_scale / output_scale;                        
+                        float rescale = intput_scale / output_scale;
 
                         int in_n = input_tensor->dims[0];
                         int in_c = input_tensor->dims[1];
@@ -431,12 +430,12 @@ int ref_concat_uint8(struct graph* ir_graph, struct node* ir_node, int axis)
                         int in_cstep = in_h * in_w;
                         int in_nstep = in_c * in_cstep;
 
-                        uint8_t* input_data  = (uint8_t*)input_tensor->data  + n * in_nstep  + c * in_cstep  + h * in_w;
+                        uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_nstep + c * in_cstep + h * in_w;
                         uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_nstep + c * out_cstep + h * out_w + output_step;
 
-                        for (int i=0; i<in_w; i++)
+                        for (int i = 0; i < in_w; i++)
                         {
-                            int udata = roundf((input_data[i] - intput_zero) *  rescale + output_zero);
+                            int udata = roundf((input_data[i] - intput_zero) * rescale + output_zero);
                             if (udata > 255)
                                 udata = 255;
                             else if (udata < 0)
diff --git a/source/device/cpu/op/concat/concat_ref.c b/source/device/cpu/op/concat/concat_ref.c
index 7630d4be0..b3b704f5f 100644
--- a/source/device/cpu/op/concat/concat_ref.c
+++ b/source/device/cpu/op/concat/concat_ref.c
@@ -38,7 +38,6 @@
 
 #include "concat_kernel_ref.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -54,8 +53,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct concat_param* concat_param = ( struct concat_param* )ir_node->op.param_mem;
-    
+    struct concat_param* concat_param = (struct concat_param*)ir_node->op.param_mem;
+
     int ret = -1;
     if (output_tensor->data_type == TENGINE_DT_FP32)
     {
@@ -81,13 +80,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {
-        .prerun = NULL,
-        .run = run,
-        .reshape = NULL,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score};
+    .prerun = NULL,
+    .run = run,
+    .reshape = NULL,
+    .postrun = NULL,
+    .init_node = init_node,
+    .release_node = release_node,
+    .score = score};
 
 int register_concat_ref_op()
 {
diff --git a/source/device/cpu/op/conv/conv_kernel_ref.h b/source/device/cpu/op/conv/conv_kernel_ref.h
index a3b49607c..c35c156c2 100644
--- a/source/device/cpu/op/conv/conv_kernel_ref.h
+++ b/source/device/cpu/op/conv/conv_kernel_ref.h
@@ -31,17 +31,16 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                         struct tensor* bias, struct conv_param* conv_param);
+                  struct tensor* bias, struct conv_param* conv_param);
 
 int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                         struct tensor* bias, struct conv_param* conv_param);
+                  struct tensor* bias, struct conv_param* conv_param);
 
 int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                         struct tensor* bias, struct conv_param* conv_param);
+                  struct tensor* bias, struct conv_param* conv_param);
 
 int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                         struct tensor* bias, struct conv_param* conv_param);
+                   struct tensor* bias, struct conv_param* conv_param);
 
 #endif
diff --git a/source/device/cpu/op/conv/conv_kernel_ref_fp16.c b/source/device/cpu/op/conv/conv_kernel_ref_fp16.c
index 26f6aa284..3e284d063 100644
--- a/source/device/cpu/op/conv/conv_kernel_ref_fp16.c
+++ b/source/device/cpu/op/conv/conv_kernel_ref_fp16.c
@@ -37,9 +37,8 @@
 
 #include "conv_kernel_ref.h"
 
-
 int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                         struct tensor* bias, struct conv_param* conv_param)
+                  struct tensor* bias, struct conv_param* conv_param)
 {
 #if MACOS
     TLOG_ERR("FP16 not support under mac os");
@@ -88,14 +87,11 @@ int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, str
                         float total = 0.f;
                         if (input_tensor->layout == 0)
                         {
-                            output_offset = n * group * output_c * output_h * output_w +
-                                            g * output_c * output_h * output_w + c * output_h * output_w +
-                                            h * output_w + w;
+                            output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w;
                         }
                         else
                         {
-                            output_offset = n * group * output_c * output_h * output_w +
-                                            h * output_w * group * output_c + w * group * output_c + output_c * g + c;
+                            output_offset = n * group * output_c * output_h * output_w + h * output_w * group * output_c + w * group * output_c + output_c * g + c;
                         }
                         for (kc = 0; kc < input_c; ++kc)
                         {
@@ -111,25 +107,16 @@ int ref_conv_fp16(struct tensor* input_tensor, struct tensor* output_tensor, str
                                     {
                                         if (input_tensor->layout == 0)
                                         {
-                                            input_offset = n * group * input_c * input_h * input_w +
-                                                           g * input_c * input_h * input_w + kc * input_h * input_w +
-                                                           cur_y * input_w + cur_x;
-                                            kernel_offset = g * output_c * kernel_size + c * kernel_size +
-                                                            kc * conv_param->kernel_h * conv_param->kernel_w +
-                                                            kh * conv_param->kernel_w + kw;
+                                            input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x;
+                                            kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw;
                                         }
                                         else
                                         {
-                                            input_offset = n * group * input_c * input_h * input_w +
-                                                           cur_y * input_w * input_c * group + cur_x * input_c * group +
-                                                           g * input_c + kc;
-                                            kernel_offset = c * group * kernel_size +
-                                                            kh * conv_param->kernel_w * input_c * group +
-                                                            kw * input_c * group + g * input_c + kc;
+                                            input_offset = n * group * input_c * input_h * input_w + cur_y * input_w * input_c * group + cur_x * input_c * group + g * input_c + kc;
+                                            kernel_offset = c * group * kernel_size + kh * conv_param->kernel_w * input_c * group + kw * input_c * group + g * input_c + kc;
                                         }
 
-                                        total += fp16_to_fp32(input_data[input_offset]) *
-                                                 fp16_to_fp32(kernel_data[kernel_offset]);
+                                        total += fp16_to_fp32(input_data[input_offset]) * fp16_to_fp32(kernel_data[kernel_offset]);
                                     }
                                 }
                             }
diff --git a/source/device/cpu/op/conv/conv_kernel_ref_fp32.c b/source/device/cpu/op/conv/conv_kernel_ref_fp32.c
index ff331494f..692852f61 100644
--- a/source/device/cpu/op/conv/conv_kernel_ref_fp32.c
+++ b/source/device/cpu/op/conv/conv_kernel_ref_fp32.c
@@ -37,9 +37,8 @@
 
 #include "conv_kernel_ref.h"
 
-
 int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                         struct tensor* bias, struct conv_param* conv_param)
+                  struct tensor* bias, struct conv_param* conv_param)
 {
     int batch = input_tensor->dims[0];
     int group = conv_param->group;
@@ -84,9 +83,7 @@ int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, str
                         const int w_start = (w * conv_param->stride_w) - conv_param->pad_w0;
                         float total = 0.f;
 
-                        output_offset = n * group * output_c * output_h * output_w +
-                                        g * output_c * output_h * output_w + c * output_h * output_w +
-                                        h * output_w + w;
+                        output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w;
 
                         for (kc = 0; kc < input_c; ++kc)
                         {
@@ -100,13 +97,8 @@ int ref_conv_fp32(struct tensor* input_tensor, struct tensor* output_tensor, str
                                     // use zero as a default value.
                                     if ((cur_x >= 0) && (cur_x < input_w) && (cur_y >= 0) && (cur_y < input_h))
                                     {
-
-                                        input_offset = n * group * input_c * input_h * input_w +
-                                                        g * input_c * input_h * input_w + kc * input_h * input_w +
-                                                        cur_y * input_w + cur_x;
-                                        kernel_offset = g * output_c * kernel_size + c * kernel_size +
-                                                        kc * conv_param->kernel_h * conv_param->kernel_w +
-                                                        kh * conv_param->kernel_w + kw;
+                                        input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x;
+                                        kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw;
 
                                         total += input_data[input_offset] * kernel_data[kernel_offset];
                                     }
diff --git a/source/device/cpu/op/conv/conv_kernel_ref_int8.c b/source/device/cpu/op/conv/conv_kernel_ref_int8.c
index 80d6a7dab..ba27e50d6 100644
--- a/source/device/cpu/op/conv/conv_kernel_ref_int8.c
+++ b/source/device/cpu/op/conv/conv_kernel_ref_int8.c
@@ -39,9 +39,8 @@
 
 #include <math.h>
 
-
 int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                         struct tensor* bias, struct conv_param* conv_param)
+                  struct tensor* bias, struct conv_param* conv_param)
 {
     int batch = input_tensor->dims[0];
     int group = conv_param->group;
@@ -71,9 +70,9 @@ int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, str
 
     /* input and kernel scales */
     int dequant_scales_size = group * output_c;
-    float *dequant_scales = (float*)malloc(sizeof(float) * dequant_scales_size);
+    float* dequant_scales = (float*)malloc(sizeof(float) * dequant_scales_size);
 
-    for(int i = 0; i < dequant_scales_size; i++)
+    for (int i = 0; i < dequant_scales_size; i++)
     {
         dequant_scales[i] = (input_scale * kernel_scales[i]);
     }
@@ -100,14 +99,11 @@ int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, str
                         int32_t total_i32 = 0;
                         if (input_tensor->layout == 0)
                         {
-                            output_offset = n * group * output_c * output_h * output_w +
-                                            g * output_c * output_h * output_w + c * output_h * output_w +
-                                            h * output_w + w;
+                            output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w;
                         }
                         else
                         {
-                            output_offset = n * group * output_c * output_h * output_w +
-                                            h * output_w * group * output_c + w * group * output_c + output_c * g + c;
+                            output_offset = n * group * output_c * output_h * output_w + h * output_w * group * output_c + w * group * output_c + output_c * g + c;
                         }
                         for (kc = 0; kc < input_c; ++kc)
                         {
@@ -123,21 +119,13 @@ int ref_conv_int8(struct tensor* input_tensor, struct tensor* output_tensor, str
                                     {
                                         if (input_tensor->layout == 0)
                                         {
-                                            input_offset = n * group * input_c * input_h * input_w +
-                                                           g * input_c * input_h * input_w + kc * input_h * input_w +
-                                                           cur_y * input_w + cur_x;
-                                            kernel_offset = g * output_c * kernel_size + c * kernel_size +
-                                                            kc * conv_param->kernel_h * conv_param->kernel_w +
-                                                            kh * conv_param->kernel_w + kw;
+                                            input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x;
+                                            kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw;
                                         }
                                         else
                                         {
-                                            input_offset = n * group * input_c * input_h * input_w +
-                                                           cur_y * input_w * input_c * group + cur_x * input_c * group +
-                                                           g * input_c + kc;
-                                            kernel_offset = c * group * kernel_size +
-                                                            kh * conv_param->kernel_w * input_c * group +
-                                                            kw * input_c * group + g * input_c + kc;
+                                            input_offset = n * group * input_c * input_h * input_w + cur_y * input_w * input_c * group + cur_x * input_c * group + g * input_c + kc;
+                                            kernel_offset = c * group * kernel_size + kh * conv_param->kernel_w * input_c * group + kw * input_c * group + g * input_c + kc;
                                         }
 
                                         total_i32 += (int32_t)input_i8[input_offset] * (int32_t)kernel_i8[kernel_offset];
diff --git a/source/device/cpu/op/conv/conv_kernel_ref_uint8.c b/source/device/cpu/op/conv/conv_kernel_ref_uint8.c
index c236fa84a..376f15ad3 100644
--- a/source/device/cpu/op/conv/conv_kernel_ref_uint8.c
+++ b/source/device/cpu/op/conv/conv_kernel_ref_uint8.c
@@ -39,9 +39,8 @@
 
 #include <math.h>
 
-
 int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* kernel,
-                          struct tensor* bias, struct conv_param* conv_param)
+                   struct tensor* bias, struct conv_param* conv_param)
 {
     int batch = input_tensor->dims[0];
     int group = conv_param->group;
@@ -74,15 +73,15 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st
 
     /* dequant input  */
     int input_size = batch * group * input_c * input_h * input_w;
-    float* input_fp32 = ( float* )sys_malloc(sizeof(float) * input_size);
+    float* input_fp32 = (float*)sys_malloc(sizeof(float) * input_size);
     for (int i = 0; i < input_size; i++)
-        input_fp32[i] = (( float )input_data[i] - input_zero) * input_scale;
+        input_fp32[i] = ((float)input_data[i] - input_zero) * input_scale;
 
     /* dequant kernel  */
     int kernel_total = group * output_c * kernel_size;
-    float* kernel_fp32 = ( float* )sys_malloc(sizeof(float) * kernel_total);
+    float* kernel_fp32 = (float*)sys_malloc(sizeof(float) * kernel_total);
     for (int i = 0; i < kernel_total; i++)
-        kernel_fp32[i] = (( float )kernel_data[i] - kernel_zero) * kernel_scale;
+        kernel_fp32[i] = ((float)kernel_data[i] - kernel_zero) * kernel_scale;
 
     /* dequant biases  */
     int bias_size = group * output_c;
@@ -90,9 +89,9 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st
     float* bias_fp32 = NULL;
     if (bias != NULL)
     {
-        bias_fp32 = ( float* )sys_malloc(sizeof(float) * bias_size);
+        bias_fp32 = (float*)sys_malloc(sizeof(float) * bias_size);
         for (int i = 0; i < bias_size; i++)
-            bias_fp32[i] = ( float )bias_data[i] * input_scale * kernel_scale;
+            bias_fp32[i] = (float)bias_data[i] * input_scale * kernel_scale;
     }
 
     if (conv_param->kernel_h == 0)
@@ -117,14 +116,11 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st
                         float total = 0.f;
                         if (input_tensor->layout == 0)
                         {
-                            output_offset = n * group * output_c * output_h * output_w +
-                                            g * output_c * output_h * output_w + c * output_h * output_w +
-                                            h * output_w + w;
+                            output_offset = n * group * output_c * output_h * output_w + g * output_c * output_h * output_w + c * output_h * output_w + h * output_w + w;
                         }
                         else
                         {
-                            output_offset = n * group * output_c * output_h * output_w +
-                                            h * output_w * group * output_c + w * group * output_c + output_c * g + c;
+                            output_offset = n * group * output_c * output_h * output_w + h * output_w * group * output_c + w * group * output_c + output_c * g + c;
                         }
                         for (kc = 0; kc < input_c; ++kc)
                         {
@@ -140,21 +136,13 @@ int ref_conv_uint8(struct tensor* input_tensor, struct tensor* output_tensor, st
                                     {
                                         if (input_tensor->layout == 0)
                                         {
-                                            input_offset = n * group * input_c * input_h * input_w +
-                                                           g * input_c * input_h * input_w + kc * input_h * input_w +
-                                                           cur_y * input_w + cur_x;
-                                            kernel_offset = g * output_c * kernel_size + c * kernel_size +
-                                                            kc * conv_param->kernel_h * conv_param->kernel_w +
-                                                            kh * conv_param->kernel_w + kw;
+                                            input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + cur_y * input_w + cur_x;
+                                            kernel_offset = g * output_c * kernel_size + c * kernel_size + kc * conv_param->kernel_h * conv_param->kernel_w + kh * conv_param->kernel_w + kw;
                                         }
                                         else
                                         {
-                                            input_offset = n * group * input_c * input_h * input_w +
-                                                           cur_y * input_w * input_c * group + cur_x * input_c * group +
-                                                           g * input_c + kc;
-                                            kernel_offset = c * group * kernel_size +
-                                                            kh * conv_param->kernel_w * input_c * group +
-                                                            kw * input_c * group + g * input_c + kc;
+                                            input_offset = n * group * input_c * input_h * input_w + cur_y * input_w * input_c * group + cur_x * input_c * group + g * input_c + kc;
+                                            kernel_offset = c * group * kernel_size + kh * conv_param->kernel_w * input_c * group + kw * input_c * group + g * input_c + kc;
                                         }
 
                                         total += input_fp32[input_offset] * kernel_fp32[kernel_offset];
diff --git a/source/device/cpu/op/conv/conv_ref.c b/source/device/cpu/op/conv/conv_ref.c
index 3f403aa1f..8f655f580 100644
--- a/source/device/cpu/op/conv/conv_ref.c
+++ b/source/device/cpu/op/conv/conv_ref.c
@@ -37,7 +37,6 @@
 
 #include "conv_kernel_ref.h"
 
-
 // add conv op by wangxinwei for debug conv
 //======================================================================================================//
 
@@ -57,7 +56,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     }
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     int ret = 0;
     if (input_tensor->data_type == TENGINE_DT_FP32)
@@ -86,12 +85,12 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* dynamic get the shape of output tensor */
     int n = input_tensor->dims[0];
     int h = input_tensor->dims[2];
-    int w = input_tensor->dims[3]; 
+    int w = input_tensor->dims[3];
     int ret = 0;
 
     if (conv_param->kernel_w == 0)
@@ -133,10 +132,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     }
     else
     {
-        out_h =
-                (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) /
-                conv_param->stride_h +
-                1;
+        out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1;
     }
 
     if (conv_param->pad_w0 < 0)
@@ -159,10 +155,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     }
     else
     {
-        out_w =
-                (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) /
-                conv_param->stride_w +
-                1;
+        out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1;
     }
 
     int dims[4];
@@ -207,12 +200,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {.prerun = NULL,
-        .run = run,
-        .reshape = reshape,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score};
+                                       .run = run,
+                                       .reshape = reshape,
+                                       .postrun = NULL,
+                                       .init_node = init_node,
+                                       .release_node = release_node,
+                                       .score = score};
 
 int register_conv_ref_op()
 {
diff --git a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c
index 3c3854143..db451322f 100644
--- a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c
+++ b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.c
@@ -37,7 +37,7 @@ void dw_k3s2_fp16_relu_fused_a76(__fp16* bias, __fp16* input, __fp16* kernel, __
 void dw_k3s2_fp16_relu6_fused_a76(__fp16* bias, __fp16* input, __fp16* kernel, __fp16* output, long channel_number, long input_w, long input_h, long pad0);
 
 int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity)
+                     struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity)
 {
     /* param */
     int pads[4];
@@ -79,7 +79,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor,
     if (bias_tensor)
         bias_buf = bias_tensor->data;
 
-    for (int n = 0; n < batch; n++)    // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
         __fp16* input = input_buf + n * input_size * group;
         __fp16* output = output_buf + n * output_size * group;
@@ -92,7 +92,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor,
         {
             if (activation == 0)
             {
-                #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
                 for (int i = 0; i < group; i++)
                 {
                     __fp16* cur_input = input + i * channel_size;
@@ -106,7 +106,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor,
             }
             else if (activation > 0)
             {
-                #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
                 for (int i = 0; i < group; i++)
                 {
                     __fp16* cur_input = input + i * channel_size;
@@ -120,7 +120,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor,
             }
             else
             {
-                #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
                 for (int i = 0; i < group; i++)
                 {
                     __fp16* cur_input = input + i * channel_size;
@@ -137,7 +137,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor,
         {
             if (activation == 0)
             {
-                #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
                 for (int i = 0; i < group; i++)
                 {
                     __fp16* cur_input = input + i * channel_size;
@@ -151,7 +151,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor,
             }
             else if (activation > 0)
             {
-                #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
                 for (int i = 0; i < group; i++)
                 {
                     __fp16* cur_input = input + i * channel_size;
@@ -165,7 +165,7 @@ int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor,
             }
             else
             {
-                #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
                 for (int i = 0; i < group; i++)
                 {
                     __fp16* cur_input = input + i * channel_size;
diff --git a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h
index 052ffcc18..a07995f59 100644
--- a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h
+++ b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_dw_kernel_fp16_arm82.h
@@ -28,7 +28,6 @@
 #include "convolution_param.h"
 
 int conv_dw_fp16_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity)
-   ;
+                     struct tensor* output_tensor, struct conv_param* param, int num_thread, int cpu_affinity);
 
 #endif
diff --git a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c
index ca2698959..3e9a92944 100644
--- a/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c
+++ b/source/device/cpu/op/conv/cortex-a/armv8.2/conv_kernel_fp16_arm82.c
@@ -33,12 +33,12 @@
 #define PER_OUT_CHAN 16
 
 void hgemm_4x16_a76(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output,
-                               long output_xy, long fused_relu);
+                    long output_xy, long fused_relu);
 void hgemm_4x4_a76(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output,
-                              long output_xy, long fused_relu);
+                   long output_xy, long fused_relu);
 
 void im2col_fp16_1x1(__fp16* input, long input_xy, __fp16* col, long col_cnt, long input_chan);
-void im2col_fp16_3x3(__fp16* input, long input_x, long input_y, long input_chan, __fp16* col, long stride);                         
+void im2col_fp16_3x3(__fp16* input, long input_x, long input_y, long input_chan, __fp16* col, long stride);
 
 void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x,
             int stride_y, int dilation_x, int dilation_y, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int output_x,
@@ -51,7 +51,7 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
     __fp16* cur_col = col + col_start * kernel_size;
     int col_i, col_j, kch, ky, kx, i;
 
-    if((kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1))
+    if ((kernel_x == 1) && (kernel_y == 1) && (stride_x == 1) && (stride_y == 1))
     {
         {
             int col_cnt = (col_end & -4) - (col_start & -4);
@@ -60,13 +60,13 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
             col_i = col_end & -4;
         }
         // final 4 input
-        if(col_end & 0x3)
+        if (col_end & 0x3)
         {
-            for(col_j = 0; col_j < kernel_size; col_j++)
+            for (col_j = 0; col_j < kernel_size; col_j++)
             {
-                for(i = 0; i < 4; i++)
+                for (i = 0; i < 4; i++)
                 {
-                    if((col_i + i) < col_end)
+                    if ((col_i + i) < col_end)
                         *cur_col++ = *(im + input_xy * col_j + col_i + i);
                     else
                         *cur_col++ = 0.0;
@@ -74,22 +74,20 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
             }
         }
     }
-    else if((kernel_x == 3) && (kernel_y == 3) && (dilation_x == 1) && (dilation_y == 1))
+    else if ((kernel_x == 3) && (kernel_y == 3) && (dilation_x == 1) && (dilation_y == 1))
     {
-        
         int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0);
-        for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+        for (col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             cur_col = col + col_i * kernel_size;
             int imy0 = col_i / output_x;
             int imy3 = (col_i + 3) / output_x;
             int imx0 = col_i - imy0 * output_x;
             int imx3 = (col_i + 3) - imy3 * output_x;
-            if((imy0 == imy3) &&
-               (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1))))
+            if ((imy0 == imy3) && (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (output_y - 1) && imx3 != (output_x - 1))))
             {
                 __fp16* l0 = im + (imy0 * stride_y - pad_y) * input_x + (imx0 * stride_x - pad_x);
-               
+
                 {
                     im2col_fp16_3x3(l0, input_x, input_y, input_chan, cur_col, stride_x);
                     cur_col += 4 * kernel_size;
@@ -103,15 +101,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
                                     cnt_x[2] * stride_x - pad_x, cnt_x[3] * stride_x - pad_x};
                 int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y,
                                     cnt_y[2] * stride_y - pad_y, cnt_y[3] * stride_y - pad_y};
-                for(kch = 0; kch < input_chan; kch++)
-                    for(ky = 0; ky < 3; ky++)
-                        for(kx = 0; kx < 3; kx++)
+                for (kch = 0; kch < input_chan; kch++)
+                    for (ky = 0; ky < 3; ky++)
+                        for (kx = 0; kx < 3; kx++)
                         {
                             int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx};
                             int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky};
-                            for(i = 0; i < 4; i++)
+                            for (i = 0; i < 4; i++)
                             {
-                                if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                 else
                                     *cur_col++ = 0.0;
@@ -120,7 +118,7 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
             }
         }
         // final 4 input
-        if(col_end & 0x3)
+        if (col_end & 0x3)
         {
             int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x};
             int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1,
@@ -129,16 +127,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
                                 cnt_x[3] * stride_x - pad_x};
             int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y,
                                 cnt_y[3] * stride_y - pad_y};
-            for(kch = 0; kch < input_chan; kch++)
-                for(ky = 0; ky < 3; ky++)
-                    for(kx = 0; kx < 3; kx++)
+            for (kch = 0; kch < input_chan; kch++)
+                for (ky = 0; ky < 3; ky++)
+                    for (kx = 0; kx < 3; kx++)
                     {
                         int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx};
                         int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky};
-                        for(i = 0; i < 4; i++)
+                        for (i = 0; i < 4; i++)
                         {
-                            if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 &&
-                               imy[i] < input_y)
+                            if ((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0.0;
@@ -147,8 +144,8 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
         }
     }
     else
-    {    // for general cases
-        for(col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+    { // for general cases
+        for (col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x};
             int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1,
@@ -157,15 +154,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
                                 cnt_x[3] * stride_x - pad_x};
             int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y,
                                 cnt_y[3] * stride_y - pad_y};
-            for(kch = 0; kch < input_chan; kch++)
-                for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y)
-                    for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x)
+            for (kch = 0; kch < input_chan; kch++)
+                for (ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y)
+                    for (kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x)
                     {
                         int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx};
                         int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky};
-                        for(i = 0; i < 4; i++)
+                        for (i = 0; i < 4; i++)
                         {
-                            if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                            if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0.0;
@@ -173,7 +170,7 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
                     }
         }
         // final 4 input
-        if(col_end & 0x3)
+        if (col_end & 0x3)
         {
             int cnt_y[4] = {col_i / output_x, (col_i + 1) / output_x, (col_i + 2) / output_x, (col_i + 3) / output_x};
             int cnt_x[4] = {col_i - cnt_y[0] * output_x, col_i - cnt_y[1] * output_x + 1,
@@ -182,16 +179,15 @@ void im2col(__fp16* im, __fp16* col, int input_chan, int input_x, int input_y, i
                                 cnt_x[3] * stride_x - pad_x};
             int imy_start[4] = {cnt_y[0] * stride_y - pad_y, cnt_y[1] * stride_y - pad_y, cnt_y[2] * stride_y - pad_y,
                                 cnt_y[3] * stride_y - pad_y};
-            for(kch = 0; kch < input_chan; kch++)
-                for(ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y)
-                    for(kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x)
+            for (kch = 0; kch < input_chan; kch++)
+                for (ky = 0; ky < (kernel_y * dilation_y); ky += dilation_y)
+                    for (kx = 0; kx < (kernel_x * dilation_x); kx += dilation_x)
                     {
                         int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx};
                         int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky};
-                        for(i = 0; i < 4; i++)
+                        for (i = 0; i < 4; i++)
                         {
-                            if((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 &&
-                               imy[i] < input_y)
+                            if ((col_i + i) < col_end && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0.0;
@@ -214,7 +210,7 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch
     __fp16* cur_kernel_interleaved = kernel_interleaved;
 
     // interleave 16 kernels
-    for(i = 0; i < (kernel_chan & -16); i += 16)
+    for (i = 0; i < (kernel_chan & -16); i += 16)
     {
         cur_kernel0 = kernel + kernel_size * i;
         cur_kernel1 = kernel + kernel_size * (i + 1);
@@ -232,7 +228,7 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch
         cur_kernel13 = kernel + kernel_size * (i + 13);
         cur_kernel14 = kernel + kernel_size * (i + 14);
         cur_kernel15 = kernel + kernel_size * (i + 15);
-        for(j = 0; j < kernel_size; j++)
+        for (j = 0; j < kernel_size; j++)
         {
             *(cur_kernel_interleaved++) = cur_kernel0[j];
             *(cur_kernel_interleaved++) = cur_kernel1[j];
@@ -253,13 +249,13 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch
         }
     }
 
-    for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4)
+    for (i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4)
     {
         cur_kernel0 = kernel + kernel_size * i;
         cur_kernel1 = kernel + kernel_size * (i + 1);
         cur_kernel2 = kernel + kernel_size * (i + 2);
         cur_kernel3 = kernel + kernel_size * (i + 3);
-        for(j = 0; j < kernel_size; j++)
+        for (j = 0; j < kernel_size; j++)
         {
             *(cur_kernel_interleaved++) = cur_kernel0[j];
             *(cur_kernel_interleaved++) = cur_kernel1[j];
@@ -271,9 +267,9 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch
     cur_kernel0 = kernel + kernel_size * i;
     cur_kernel1 = kernel + kernel_size * (i + 1);
     cur_kernel2 = kernel + kernel_size * (i + 2);
-    if((kernel_chan & 0x3) == 3)
+    if ((kernel_chan & 0x3) == 3)
     {
-        for(j = 0; j < kernel_size; j++)
+        for (j = 0; j < kernel_size; j++)
         {
             *(cur_kernel_interleaved++) = cur_kernel0[j];
             *(cur_kernel_interleaved++) = cur_kernel1[j];
@@ -281,9 +277,9 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch
             *(cur_kernel_interleaved++) = 0.0;
         }
     }
-    else if((kernel_chan & 0x3) == 2)
+    else if ((kernel_chan & 0x3) == 2)
     {
-        for(j = 0; j < kernel_size; j++)
+        for (j = 0; j < kernel_size; j++)
         {
             *(cur_kernel_interleaved++) = cur_kernel0[j];
             *(cur_kernel_interleaved++) = cur_kernel1[j];
@@ -291,9 +287,9 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch
             *(cur_kernel_interleaved++) = 0.0;
         }
     }
-    else if((kernel_chan & 0x3) == 1)
+    else if ((kernel_chan & 0x3) == 1)
     {
-        for(j = 0; j < kernel_size; j++)
+        for (j = 0; j < kernel_size; j++)
         {
             *(cur_kernel_interleaved++) = cur_kernel0[j];
             *(cur_kernel_interleaved++) = 0.0;
@@ -303,7 +299,7 @@ void interleave_kernel(__fp16* kernel, __fp16* kernel_interleaved, int kernel_ch
     }
 }
 
-static void interleave(struct tensor * filter, struct conv_priv_info*  priv_info, struct conv_param* param)
+static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param)
 {
     int group = param->group;
     int out_chan = filter->dims[0] / group;
@@ -313,9 +309,9 @@ static void interleave(struct tensor * filter, struct conv_priv_info*  priv_info
     int kernel_interleaved_size_g = kernel_size * ((out_chan + 3) & -4);
 
     __fp16* kernel = (__fp16*)filter->data;
-    
+
     __fp16* interleave_buf = (__fp16*)priv_info->interleave_buffer;
-    for(int g = 0; g < group; g++)
+    for (int g = 0; g < group; g++)
     {
         __fp16* cur_kernel = kernel + g * kernel_size_g;
         __fp16* cur_interleave = interleave_buf + g * kernel_interleaved_size_g;
@@ -327,33 +323,33 @@ static void hgemm_set(__fp16* col, __fp16* kernel, __fp16* biases, __fp16* outpu
                       int ch_start, int ch_end, int output_xy, int relu_fused, int num_thread, int cpu_affinity)
 {
     int nn_outch = ch_end / PER_OUT_CHAN;
-    int col_end3 = output_xy & 0x3;    
+    int col_end3 = output_xy & 0x3;
 
     if (col_end3)
     {
-        #pragma omp parallel for num_threads(num_thread)
-        for (int pp=0; pp<nn_outch; pp++)
+#pragma omp parallel for num_threads(num_thread)
+        for (int pp = 0; pp < nn_outch; pp++)
         {
             int p = pp * PER_OUT_CHAN;
 
-            __fp16* biasptr = biases ? (__fp16* )(biases + p) : NULL;
-            __fp16* kernel_tmp = (__fp16* )(kernel + p * kernel_size);
-            __fp16* output_tmp = (__fp16* )(output + p * output_xy);
+            __fp16* biasptr = biases ? (__fp16*)(biases + p) : NULL;
+            __fp16* kernel_tmp = (__fp16*)(kernel + p * kernel_size);
+            __fp16* output_tmp = (__fp16*)(output + p * output_xy);
 
             int col_line = 0;
-            for(col_line = 0; col_line + 3 < output_xy; col_line += 4)
+            for (col_line = 0; col_line + 3 < output_xy; col_line += 4)
             {
-                __fp16* col_tmp = ( __fp16* )(col + col_line * kernel_size);
+                __fp16* col_tmp = (__fp16*)(col + col_line * kernel_size);
                 hgemm_4x16_a76(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, relu_fused);
             }
             {
                 __fp16 result[64];
-                __fp16* col_tmp = ( __fp16* )(col + col_line * kernel_size);
+                __fp16* col_tmp = (__fp16*)(col + col_line * kernel_size);
                 hgemm_4x16_a76(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, relu_fused);
 
-                for(int i = 0; i < 16; i++)
+                for (int i = 0; i < 16; i++)
                 {
-                    for(int j = 0; j < (col_end3); j++)
+                    for (int j = 0; j < (col_end3); j++)
                         *(output + (p + i) * output_xy + col_line + j) = result[(i << 2) + j];
                 }
             }
@@ -361,18 +357,18 @@ static void hgemm_set(__fp16* col, __fp16* kernel, __fp16* biases, __fp16* outpu
     }
     else
     {
-        #pragma omp parallel for num_threads(num_thread)
-        for (int pp=0; pp<nn_outch; pp++)
+#pragma omp parallel for num_threads(num_thread)
+        for (int pp = 0; pp < nn_outch; pp++)
         {
             int p = pp * PER_OUT_CHAN;
 
-            __fp16* biasptr = biases ? (__fp16* )(biases + p) : NULL;
-            __fp16* kernel_tmp = (__fp16* )(kernel + p * kernel_size);
-            __fp16* output_tmp = (__fp16* )(output + p * output_xy);
+            __fp16* biasptr = biases ? (__fp16*)(biases + p) : NULL;
+            __fp16* kernel_tmp = (__fp16*)(kernel + p * kernel_size);
+            __fp16* output_tmp = (__fp16*)(output + p * output_xy);
 
-            for(int col_line = 0; col_line + 3 < output_xy; col_line += 4)
+            for (int col_line = 0; col_line + 3 < output_xy; col_line += 4)
             {
-                __fp16* col_tmp = (__fp16* )(col + col_line * kernel_size);
+                __fp16* col_tmp = (__fp16*)(col + col_line * kernel_size);
                 hgemm_4x16_a76(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, relu_fused);
             }
         }
@@ -390,91 +386,91 @@ static void hgemm4x4(__fp16* col, __fp16* kernel, __fp16* biases, __fp16* output
     int col_end3 = output_xy & 0x3;
     int kernel_end3 = ch_end & 0x3;
 
-    for(kernel_num = ch_start; kernel_num < (ch_end & -4); kernel_num += 4)
+    for (kernel_num = ch_start; kernel_num < (ch_end & -4); kernel_num += 4)
     {
-        if(biases)
+        if (biases)
             cur_biases = biases + kernel_num;
         cur_kernel = kernel + kernel_num * kernel_size;
         cur_output = output + kernel_num * output_xy;
-        for(col_line = 0; col_line < (output_xy & -4); col_line += 4)
+        for (col_line = 0; col_line < (output_xy & -4); col_line += 4)
         {
             cur_col = col + col_line * kernel_size;
             hgemm_4x4_a76(cur_biases, cur_col, cur_kernel, kernel_size, cur_output + col_line, output_xy, relu_fused);
         }
-        if(col_end3)
+        if (col_end3)
         {
             cur_col = col + col_line * kernel_size;
             hgemm_4x4_a76(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, relu_fused);
 
-            for(i = 0; i < 4; i++)
+            for (i = 0; i < 4; i++)
             {
-                for(j = 0; j < (col_end3); j++)
+                for (j = 0; j < (col_end3); j++)
                     *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j];
             }
         }
     }
-    if(kernel_end3)
+    if (kernel_end3)
     {
-        if(biases)
+        if (biases)
             cur_biases = biases + kernel_num;
         cur_kernel = kernel + kernel_num * kernel_size;
-        for(col_line = 0; col_line < (output_xy & -4); col_line += 4)
+        for (col_line = 0; col_line < (output_xy & -4); col_line += 4)
         {
             cur_col = col + col_line * kernel_size;
             hgemm_4x4_a76(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, relu_fused);
 
-            for(i = 0; i < kernel_end3; i++)
-                for(j = 0; j < 4; j++)
+            for (i = 0; i < kernel_end3; i++)
+                for (j = 0; j < 4; j++)
                     *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j];
         }
-        if(col_end3)
+        if (col_end3)
         {
             cur_col = col + col_line * kernel_size;
             hgemm_4x4_a76(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, relu_fused);
 
-            for(i = 0; i < (kernel_end3); i++)
+            for (i = 0; i < (kernel_end3); i++)
             {
-                for(j = 0; j < (col_end3); j++)
+                for (j = 0; j < (col_end3); j++)
                     *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j];
             }
         }
     }
 }
 
-int fp16_conv_hcl_get_shared_mem_size(struct tensor*  input , \
-                                 struct tensor*  output , \
-                                 struct conv_param* param)
+int fp16_conv_hcl_get_shared_mem_size(struct tensor* input,
+                                      struct tensor* output,
+                                      struct conv_param* param)
 {
     int group = param->group;
     int input_chan = param->input_channel / group;
     int kernel_size = input_chan * param->kernel_h * param->kernel_w;
-    
+
     int output_xy = output->dims[2] * output->dims[3];
     int mem_size = sizeof(__fp16) * kernel_size * ((output_xy + 3) & -4) + 128;
 
     return mem_size;
 }
 
-static int get_private_mem_size(struct tensor * filter, struct conv_param* param)
+static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
 {
     int group = param->group;
     int out_chan = filter->dims[0] / group;
     int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3];
-    
+
     int mem_size = sizeof(__fp16) * kernel_size * ((out_chan + 3) & -4) * group + 128;
 
     return mem_size;
 }
 
-int fp16_conv_hcl_prerun(struct tensor*  input_tensor , \
-                    struct tensor*  filter_tensor ,  \
-                    struct tensor*  output_tensor , \
-                    struct conv_priv_info*  priv_info , \
-                    struct conv_param* param)
+int fp16_conv_hcl_prerun(struct tensor* input_tensor,
+                         struct tensor* filter_tensor,
+                         struct tensor* output_tensor,
+                         struct conv_priv_info* priv_info,
+                         struct conv_param* param)
 {
     if (!priv_info->external_im2col_mem)
     {
-        int mem_size = fp16_conv_hcl_get_shared_mem_size(input_tensor , output_tensor , param);
+        int mem_size = fp16_conv_hcl_get_shared_mem_size(input_tensor, output_tensor, param);
         void* mem = sys_malloc(mem_size);
         priv_info->im2col_buffer = mem;
         priv_info->im2col_buffer_size = mem_size;
@@ -493,15 +489,15 @@ int fp16_conv_hcl_prerun(struct tensor*  input_tensor , \
     return 0;
 }
 
-int fp16_conv_hcl_postrun(struct conv_priv_info*  priv_info)
+int fp16_conv_hcl_postrun(struct conv_priv_info* priv_info)
 {
-    if(!priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL)
+    if (!priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL)
     {
         sys_free(priv_info->interleave_buffer);
         priv_info->interleave_buffer = NULL;
     }
 
-    if(!priv_info->external_im2col_mem && priv_info->im2col_buffer != NULL)
+    if (!priv_info->external_im2col_mem && priv_info->im2col_buffer != NULL)
     {
         sys_free(priv_info->im2col_buffer);
         priv_info->im2col_buffer = NULL;
@@ -510,13 +506,13 @@ int fp16_conv_hcl_postrun(struct conv_priv_info*  priv_info)
     return 0;
 }
 
-int fp16_conv_hcl_run(struct tensor* input_tensor , \
-                    struct tensor* filter_tensor , \
-                    struct tensor* bias_tensor ,  \
-                    struct tensor* output_tensor , \
-                    struct conv_priv_info* priv_info , \
-                    struct conv_param* param, \
-                    int num_thread, int cpu_affinity)
+int fp16_conv_hcl_run(struct tensor* input_tensor,
+                      struct tensor* filter_tensor,
+                      struct tensor* bias_tensor,
+                      struct tensor* output_tensor,
+                      struct conv_priv_info* priv_info,
+                      struct conv_param* param,
+                      int num_thread, int cpu_affinity)
 {
     /* param */
     // TLOG_ERR("run into fp16_conv_hcl_run!\n");
@@ -558,23 +554,23 @@ int fp16_conv_hcl_run(struct tensor* input_tensor , \
 
     int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN;
     int sgemm_set_remain = out_c % PER_OUT_CHAN;
-    for(int n = 0; n < batch; n++) // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
-        for(int g = 0; g < group; g++)
+        for (int g = 0; g < group; g++)
         {
             /* im2col */
-            __fp16* cur_input = input_buf + (n * group + g) *input_size;
+            __fp16* cur_input = input_buf + (n * group + g) * input_size;
 
             im2col(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h,
-                    stride_w, stride_h, dilation_w, dilation_h, pad_w0, pad_w1, pad_h0, pad_h1,
-                    out_w, out_h, 0, out_hw);
+                   stride_w, stride_h, dilation_w, dilation_h, pad_w0, pad_w1, pad_h0, pad_h1,
+                   out_w, out_h, 0, out_hw);
 
             /* gemm */
             __fp16* cur_kernel = interleave_buf + g * (kernel_size * ((out_c + 3) & -4));
             __fp16* cur_output = output_buf + (n * group + g) * output_size;
-            __fp16* cur_bias = biases_buf? (biases_buf + g * out_c) : NULL;
+            __fp16* cur_bias = biases_buf ? (biases_buf + g * out_c) : NULL;
             hgemm_set(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, 0, sgemm_set_chan, out_hw, fused_relu, num_thread, cpu_affinity);
-            if(sgemm_set_remain)
+            if (sgemm_set_remain)
             {
                 hgemm4x4(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, sgemm_set_chan, out_c, out_hw, fused_relu, num_thread, cpu_affinity);
             }
diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h
index dad13a6f8..f4d93b091 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/conv_dw_dilation_kernel_arm.h
@@ -65,8 +65,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[5]), vld1q_f32(input_buf_c + h * input_w + w + pad));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[6]),
                                   vld1q_f32(input_buf_c + (h + pad) * input_w + w - pad));
-                tmp_4 =
-                    vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w));
+                tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[8]),
                                   vld1q_f32(input_buf_c + (h + pad) * input_w + w + pad));
                 tmp_4 = vector_activation(tmp_4, activation);
@@ -115,8 +114,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float
 
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[0]),
                                   vld1q_f32(input_buf_c + (h - pad) * input_w + w - pad));
-                tmp_4 =
-                    vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w));
+                tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[2]),
                                   vld1q_f32(input_buf_c + (h - pad) * input_w + w + pad));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[3]), vld1q_f32(input_buf_c + h * input_w + w - pad));
@@ -124,8 +122,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[5]), vld1q_f32(input_buf_c + h * input_w + w + pad));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[6]),
                                   vld1q_f32(input_buf_c + (h + pad) * input_w + w - pad));
-                tmp_4 =
-                    vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w));
+                tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[7]), vld1q_f32(input_buf_c + (h + pad) * input_w + w));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[8]),
                                   vld1q_f32(input_buf_c + (h + pad) * input_w + w + pad));
                 tmp_4 = vector_activation(tmp_4, activation);
@@ -177,8 +174,7 @@ int conv_dw_dilation_run(float* input_buf, float* weight_buf, float* bias, float
 
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[0]),
                                   vld1q_f32(input_buf_c + (h - pad) * input_w + w - pad));
-                tmp_4 =
-                    vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w));
+                tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[1]), vld1q_f32(input_buf_c + (h - pad) * input_w + w));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[2]),
                                   vld1q_f32(input_buf_c + (h - pad) * input_w + w + pad));
                 tmp_4 = vmlaq_f32(tmp_4, vdupq_n_f32(weight_buf_c[3]), vld1q_f32(input_buf_c + h * input_w + w - pad));
diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c b/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c
index a59389549..cd18faf5d 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/conv_dw_hcl_arm.c
@@ -50,8 +50,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* get cpu affinity */
     conv_priv_info->cpu_type = exec_graph->cpu_affinity;
@@ -67,7 +67,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
             return -1;
         }
     }
-        /* int8 prerun */
+    /* int8 prerun */
     else if (exec_graph->mode == TENGINE_MODE_INT8)
     {
         /* do prerun */
@@ -100,8 +100,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 run */
     if (exec_graph->mode == TENGINE_MODE_FP32)
@@ -114,7 +114,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         }
     }
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        else if (exec_graph->mode == TENGINE_MODE_FP16)
+    else if (exec_graph->mode == TENGINE_MODE_FP16)
     {
         if (conv_dw_fp16_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread, cpu_affinity) < 0)
         {
@@ -124,7 +124,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         }
     }
 #endif
-        /* int8 run */
+    /* int8 run */
     else if (exec_graph->mode == TENGINE_MODE_INT8)
     {
         if (conv_dw_int8_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, cpu_affinity) < 0)
@@ -145,7 +145,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 postrun */
     if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8)
@@ -157,7 +157,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc
             return -1;
         }
     }
-        /* int8 postrun */
+    /* int8 postrun */
     else if (exec_graph->mode == TENGINE_MODE_INT8)
     {
         if (conv_dw_int8_postrun(conv_priv_info) < 0)
@@ -171,17 +171,15 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc
     return 0;
 }
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* init the private info data of convolution op */
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info));
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info));
     if (conv_priv_info == NULL)
     {
-
         return -1;
     }
     memset(conv_priv_info, 0, sizeof(struct conv_priv_info));
@@ -191,7 +189,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
 
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
     sys_free(conv_priv_info);
     exec_node->ops_priv = NULL;
     return 0;
@@ -199,7 +197,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     struct node* ir_node = exec_node;
     struct graph* ir_graph = ir_node->graph;
 
@@ -232,10 +230,10 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     if (input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_INT8)
         return 0;
 #endif
-    if (kernel_h == 7 && kernel_w == 7 && stride_h == 1 && stride_w == 1)    // this is a bug, todo fix it.
+    if (kernel_h == 7 && kernel_w == 7 && stride_h == 1 && stride_w == 1) // this is a bug, todo fix it.
         return 0;
 
-    if (kernel_h == 2 && kernel_w == 2)    // this is a bug, todo fix it.
+    if (kernel_h == 2 && kernel_w == 2) // this is a bug, todo fix it.
         return 0;
 
     if (dilation_h != 1 || dilation_w != 1)
@@ -248,13 +246,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {.prerun = prerun,
-        .run = run,
-        .reshape = NULL,
-        .postrun = postrun,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score
-};
+                                       .run = run,
+                                       .reshape = NULL,
+                                       .postrun = postrun,
+                                       .init_node = init_node,
+                                       .release_node = release_node,
+                                       .score = score};
 
 int register_conv_dw_hcl_arm_op()
 {
diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h
index c969d8d1f..7186b089c 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/conv_dw_k5_k7_kernel_arm.h
@@ -55,7 +55,7 @@ static float32x4_t vector_activation(float32x4_t tmp, int type)
         tmp = vmaxq_f32(tmp, zero);
         if (type > 0)
         {
-            float32x4_t max = vdupq_n_f32(( float )type);
+            float32x4_t max = vdupq_n_f32((float)type);
             tmp = vminq_f32(tmp, max);
         }
     }
@@ -66,7 +66,7 @@ static float32x4_t vector_activation(float32x4_t tmp, int type)
 void depthwise_conv_k5s1(float* input, float* weight, float* bias, float* output, int input_h, int input_w, int channel,
                          int output_h, int output_w, int activation, int num_thread)
 {
-// #pragma omp parallel for num_threads(num_thread)
+    // #pragma omp parallel for num_threads(num_thread)
     for (int c = 0; c < channel; c++)
     {
         float* input_cur = (float*)input + c * input_h * input_w;
@@ -91,7 +91,7 @@ void depthwise_conv_k5s2(float* input_buf, float* weight_buf, float* bias, float
     int mid_w = output_w - 2;
     int mid_w_block = mid_w & -4;
 
-// #pragma omp parallel for num_threads(num_thread)
+    // #pragma omp parallel for num_threads(num_thread)
     for (int c = 0; c < channel; c++)
     {
         int w, h;
@@ -685,16 +685,14 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_31_34);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_38_41);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_45_48);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
             float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_17_20);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_24_27);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_31_34);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_38_41);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_45_48);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             *output_buf_1++ = elem_activation(tmp1, activation);
             float32x4_t tmp_4_2 = vmulq_f32(line1, kernel_10_13);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line2, kernel_17_20);
@@ -702,8 +700,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_31_34);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_38_41);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_45_48);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             *output_buf_2++ = elem_activation(tmp2, activation);
         }
         float32x4_t kernel_9_12 = vextq_f32(kernel_8_11, kernel_12_15, 1);
@@ -716,8 +713,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_30_33);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_37_40);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_44_47);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             tmp0 += weight_buf[27] * input_1[4];
             tmp0 += weight_buf[34] * input_2[4];
             tmp0 += weight_buf[41] * input_3[4];
@@ -728,8 +724,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_30_33);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_37_40);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_44_47);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             tmp1 += weight_buf[20] * input_1[4];
             tmp1 += weight_buf[27] * input_2[4];
             tmp1 += weight_buf[34] * input_3[4];
@@ -742,8 +737,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_30_33);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_37_40);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_44_47);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             tmp2 += weight_buf[13] * input_1[4];
             tmp2 += weight_buf[20] * input_2[4];
             tmp2 += weight_buf[27] * input_3[4];
@@ -1033,12 +1027,9 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_2 = vmlaq_f32(tmp_4_2, tmp, kernel_38_41);
             tmp = vextq_f32(zero, line6_1, 3);
             tmp_4_2 = vmlaq_f32(tmp_4_2, tmp, kernel_45_48);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
             *output_buf_1++ = elem_activation(tmp1, activation);
             *output_buf_2++ = elem_activation(tmp2, activation);
@@ -1117,8 +1108,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_28_31);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_35_38);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_42_45);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[25];
             tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[32];
             tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[39];
@@ -1130,8 +1120,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_28_31);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_35_38);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_42_45);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             tmp1 += vgetq_lane_f32(line1_1, 0) * weight_buf[18];
             tmp1 += vgetq_lane_f32(line2_1, 0) * weight_buf[25];
             tmp1 += vgetq_lane_f32(line3_1, 0) * weight_buf[32];
@@ -1145,8 +1134,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_28_31);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_35_38);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_42_45);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             tmp2 += vgetq_lane_f32(line1_1, 0) * weight_buf[11];
             tmp2 += vgetq_lane_f32(line2_1, 0) * weight_buf[18];
             tmp2 += vgetq_lane_f32(line3_1, 0) * weight_buf[25];
@@ -1167,8 +1155,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_28_31);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_35_38);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_42_45);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
 
             float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_14_17);
@@ -1176,8 +1163,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_28_31);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_35_38);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_42_45);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             *output_buf_1++ = elem_activation(tmp1, activation);
 
             float32x4_t tmp_4_2 = vmulq_f32(line1, kernel_7_10);
@@ -1186,8 +1172,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_28_31);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_35_38);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_42_45);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             *output_buf_2++ = elem_activation(tmp2, activation);
         }
         float32x4_t kernel_1_4 = vextq_f32(kernel_0_3, kernel_4_7, 1);
@@ -1222,8 +1207,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_45_48);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
             }
             line1_1 = vld1q_f32(input_1 + 4);
@@ -1241,8 +1225,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_30_33);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_37_40);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_44_47);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
 
                 tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[6];
                 tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[13];
@@ -1424,8 +1407,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, tmp, kernel_38_41);
                 tmp = vextq_f32(zero, line7_1, 3);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, tmp, kernel_45_48);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
                 line1 = vextq_f32(line1, line1_1, 1);
                 line2 = vextq_f32(line2, line2_1, 1);
@@ -1483,8 +1465,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_42_45);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[4];
                 tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[11];
                 tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[18];
@@ -1509,8 +1490,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_42_45);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
             }
         }
@@ -1536,23 +1516,20 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_24_27);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
             float32x4_t tmp_4_1 = vmulq_f32(line2, kernel_3_6);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_10_13);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_17_20);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_24_27);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_31_34);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             *output_buf_1++ = elem_activation(tmp1, activation);
             float32x4_t tmp_4_2 = vmulq_f32(line3, kernel_3_6);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_10_13);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_17_20);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_24_27);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             *output_buf_2++ = elem_activation(tmp2, activation);
         }
         line1_1 = vld1q_f32(input_1 + 4);
@@ -1569,8 +1546,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_23_26);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_30_33);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_37_40);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[6];
             tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[13];
             tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[20];
@@ -1584,8 +1560,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_16_19);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_23_26);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_30_33);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             tmp1 += vgetq_lane_f32(line2_1, 0) * weight_buf[6];
             tmp1 += vgetq_lane_f32(line3_1, 0) * weight_buf[13];
             tmp1 += vgetq_lane_f32(line4_1, 0) * weight_buf[20];
@@ -1597,8 +1572,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_9_12);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_16_19);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_23_26);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             tmp2 += vgetq_lane_f32(line3_1, 0) * weight_buf[6];
             tmp2 += vgetq_lane_f32(line4_1, 0) * weight_buf[13];
             tmp2 += vgetq_lane_f32(line5_1, 0) * weight_buf[20];
@@ -1871,14 +1845,11 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, tmp, kernel_38_41);
             tmp_4_1 = vmlaq_f32(tmp_4_1, tmp, kernel_31_34);
             tmp_4_2 = vmlaq_f32(tmp_4_2, tmp, kernel_24_27);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             *output_buf_1++ = elem_activation(tmp1, activation);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             *output_buf_2++ = elem_activation(tmp2, activation);
             line1 = vextq_f32(line1, line1_1, 1);
             line2 = vextq_f32(line2, line2_1, 1);
@@ -1955,8 +1926,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             tmp0 += vgetq_lane_f32(line1_1, 0) * weight_buf[4];
             tmp0 += vgetq_lane_f32(line2_1, 0) * weight_buf[11];
             tmp0 += vgetq_lane_f32(line3_1, 0) * weight_buf[18];
@@ -1969,8 +1939,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_14_17);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_21_24);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_28_31);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             tmp1 += vgetq_lane_f32(line2_1, 0) * weight_buf[4];
             tmp1 += vgetq_lane_f32(line3_1, 0) * weight_buf[11];
             tmp1 += vgetq_lane_f32(line4_1, 0) * weight_buf[18];
@@ -1981,8 +1950,7 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_7_10);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_14_17);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_21_24);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             tmp2 += vgetq_lane_f32(line3_1, 0) * weight_buf[4];
             tmp2 += vgetq_lane_f32(line4_1, 0) * weight_buf[11];
             tmp2 += vgetq_lane_f32(line5_1, 0) * weight_buf[18];
@@ -2003,23 +1971,20 @@ void depthwise_conv_k7s1(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
             float32x4_t tmp_4_1 = vmulq_f32(line2, kernel_0_3);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line3, kernel_7_10);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_14_17);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_21_24);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_28_31);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             *output_buf_1++ = elem_activation(tmp1, activation);
             float32x4_t tmp_4_2 = vmulq_f32(line3, kernel_0_3);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line4, kernel_7_10);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line5, kernel_14_17);
             tmp_4_2 = vmlaq_f32(tmp_4_2, line6, kernel_21_24);
-            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) +
-                   vgetq_lane_f32(tmp_4_2, 3) + bias_c;
+            tmp2 = vgetq_lane_f32(tmp_4_2, 0) + vgetq_lane_f32(tmp_4_2, 1) + vgetq_lane_f32(tmp_4_2, 2) + vgetq_lane_f32(tmp_4_2, 3) + bias_c;
             *output_buf_2++ = elem_activation(tmp2, activation);
         }
     }
@@ -2041,7 +2006,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
     int mid_block = mid_w >> 2;
     int w = 0;
 
-//#pragma omp parallel for num_threads(num_thread)
+    //#pragma omp parallel for num_threads(num_thread)
     for (int c = 0; c < channel; c++)
     {
         float tmp0, tmp1;
@@ -2086,8 +2051,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_31_34);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_38_41);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_45_48);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
             float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_10_13);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_17_20);
@@ -2095,8 +2059,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_31_34);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_38_41);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_45_48);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             *output_buf_1++ = elem_activation(tmp1, activation);
         }
 
@@ -2331,8 +2294,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
             tmp_4_0 = vmlaq_f32(tmp_4_0, line2_1, kernel_31_34);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line3_1, kernel_38_41);
             tmp_4_0 = vmlaq_f32(tmp_4_0, line4_1, kernel_45_48);
-            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                   vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+            tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
             *output_buf++ = elem_activation(tmp0, activation);
             float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_0789);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_0141516);
@@ -2346,8 +2308,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
             tmp_4_1 = vmlaq_f32(tmp_4_1, line4_1, kernel_31_34);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line5_1, kernel_38_41);
             tmp_4_1 = vmlaq_f32(tmp_4_1, line6_1, kernel_45_48);
-            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                   vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+            tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
             *output_buf_1++ = elem_activation(tmp1, activation);
 
             line1 = vextq_f32(line1, line1_1, 2);
@@ -2423,8 +2384,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line2, kernel_28_31);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_35_38);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_42_45);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
                 float32x4_t tmp_4_1 = vmulq_f32(line1, kernel_7_10);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line2, kernel_14_17);
@@ -2432,8 +2392,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_28_31);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_35_38);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_42_45);
-                tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                       vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+                tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
                 *output_buf_1++ = elem_activation(tmp1, activation);
             }
         }
@@ -2497,8 +2456,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_45_48);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
             }
             line1_1 = vld1q_f32(input_1 + 4);
@@ -2702,8 +2660,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5_1, kernel_31_34);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6_1, kernel_38_41);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line7_1, kernel_45_48);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
 
                 line1 = vextq_f32(line1, line1_1, 2);
@@ -2774,8 +2731,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line7, kernel_42_45);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
             }
             else
@@ -2824,15 +2780,13 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_24_27);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_38_41);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
                 float32x4_t tmp_4_1 = vmulq_f32(line3, kernel_3_6);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_10_13);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_17_20);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_24_27);
-                tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                       vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+                tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
                 *output_buf_1++ = elem_activation(tmp1, activation);
             }
             line1_1 = vld1q_f32(input_1 + 4);
@@ -3052,8 +3006,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line4_1, kernel_24_27);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5_1, kernel_31_34);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line6_1, kernel_38_41);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
 
                 float32x4_t tmp_4_1 = vmulq_f32(line3, kernel_0012);
@@ -3064,8 +3017,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line4_1, kernel_10_13);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line5_1, kernel_17_20);
                 tmp_4_1 = vmlaq_f32(tmp_4_1, line6_1, kernel_24_27);
-                tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                       vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+                tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
                 *output_buf_1++ = elem_activation(tmp1, activation);
 
                 line1 = vextq_f32(line1, line1_1, 2);
@@ -3143,15 +3095,13 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                     tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24);
                     tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31);
                     tmp_4_0 = vmlaq_f32(tmp_4_0, line6, kernel_35_38);
-                    tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                           vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                    tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                     *output_buf++ = elem_activation(tmp0, activation);
                     float32x4_t tmp_4_1 = vmulq_f32(line3, kernel_0_3);
                     tmp_4_1 = vmlaq_f32(tmp_4_1, line4, kernel_7_10);
                     tmp_4_1 = vmlaq_f32(tmp_4_1, line5, kernel_14_17);
                     tmp_4_1 = vmlaq_f32(tmp_4_1, line6, kernel_21_24);
-                    tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) +
-                           vgetq_lane_f32(tmp_4_1, 3) + bias_c;
+                    tmp1 = vgetq_lane_f32(tmp_4_1, 0) + vgetq_lane_f32(tmp_4_1, 1) + vgetq_lane_f32(tmp_4_1, 2) + vgetq_lane_f32(tmp_4_1, 3) + bias_c;
                     *output_buf_1++ = elem_activation(tmp1, activation);
                 }
             }
@@ -3205,8 +3155,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_17_20);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_24_27);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_31_34);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
             }
             line1_1 = vld1q_f32(input_1 + 4);
@@ -3356,8 +3305,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line3_1, kernel_17_20);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line4_1, kernel_24_27);
                 tmp_4_0 = vmlaq_f32(tmp_4_0, line5_1, kernel_31_34);
-                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                       vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                 *output_buf++ = elem_activation(tmp0, activation);
 
                 line1 = vextq_f32(line1, line1_1, 2);
@@ -3414,8 +3362,7 @@ void depthwise_conv_k7s2(float* input, float* weight, float* bias, float* output
                     tmp_4_0 = vmlaq_f32(tmp_4_0, line3, kernel_14_17);
                     tmp_4_0 = vmlaq_f32(tmp_4_0, line4, kernel_21_24);
                     tmp_4_0 = vmlaq_f32(tmp_4_0, line5, kernel_28_31);
-                    tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) +
-                           vgetq_lane_f32(tmp_4_0, 3) + bias_c;
+                    tmp0 = vgetq_lane_f32(tmp_4_0, 0) + vgetq_lane_f32(tmp_4_0, 1) + vgetq_lane_f32(tmp_4_0, 2) + vgetq_lane_f32(tmp_4_0, 3) + bias_c;
                     *output_buf++ = elem_activation(tmp0, activation);
                 }
             }
diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c
index 7ed499544..3ee41e0bb 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.c
@@ -36,7 +36,6 @@
 #include "utility/log.h"
 #include "device/cpu/cpu_node.h"
 
-
 static void pad_0_align_2D(float* dst, float* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w)
 {
     int i;
@@ -220,7 +219,7 @@ static void DirectConv(float* input_buf, int input_h, int input_w, float* output
 #endif
 
 int conv_dw_prerun(struct tensor* input_tensor, struct tensor* filter_tensor,
-                         struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param)
+                   struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param)
 {
     int batch = input_tensor->dims[0];
     int input_c = input_tensor->dims[1];
@@ -237,7 +236,7 @@ int conv_dw_prerun(struct tensor* input_tensor, struct tensor* filter_tensor,
 
     priv_info->input_pad = sys_malloc(batch * input_c * padded_in_h * padded_in_w * sizeof(float));
     memset(priv_info->input_pad, 0, batch * input_c * padded_in_h * padded_in_w * sizeof(float));
- 
+
     return 0;
 }
 
@@ -277,14 +276,14 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struc
     int padded_in_w = in_w + param->pad_w0 + param->pad_w1;
 
     /* buffer addr */
-    float* input_buf = ( float* )input_tensor->data;
-    float* kernel_buf = ( float* )filter_tensor->data;
-    float* output_buf = ( float* )output_tensor->data;
+    float* input_buf = (float*)input_tensor->data;
+    float* kernel_buf = (float*)filter_tensor->data;
+    float* output_buf = (float*)output_tensor->data;
     float* biases_buf = NULL;
     if (bias_tensor)
-        biases_buf = ( float* )bias_tensor->data;
+        biases_buf = (float*)bias_tensor->data;
 
-    for (int n = 0; n < batch; n++)    // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
         float* cur_input = input_buf + n * input_size * group;
         float* cur_output = output_buf + n * output_size * group;
@@ -304,7 +303,7 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struc
             if (stride_h == 1)
             {
                 pad_0_align_3D((float*)conv_info->input_pad + n * group * padded_in_h * padded_in_w, cur_input,
-                           in_h, in_w, padded_in_h, padded_in_w, group, param->pad_h0, param->pad_w0);
+                               in_h, in_w, padded_in_h, padded_in_w, group, param->pad_h0, param->pad_w0);
                 depthwise_conv_k5s1((float*)conv_info->input_pad, kernel_buf, biases_buf, cur_output, padded_in_h, padded_in_w, group, out_h, out_w,
                                     act_type, num_thread);
             }
diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h
index 17030f3cd..0e53cb03b 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_arm.h
@@ -30,21 +30,18 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 /* float32 */
 int conv_dw_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
-                    struct conv_priv_info* info, struct conv_param* param);
+                   struct conv_priv_info* info, struct conv_param* param);
 int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity)
-   ;
+                struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity);
 int conv_dw_postrun(struct conv_priv_info* priv_info);
 
 /* int8 */
 int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
-                    struct conv_priv_info* info, struct conv_param* param);
+                        struct conv_priv_info* info, struct conv_param* param);
 int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                 struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                 int num_thread, int cpu_affinity)
-   ;
+                     struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
+                     int num_thread, int cpu_affinity);
 int conv_dw_int8_postrun(struct conv_priv_info* priv_info);
 #endif
diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c
index ffbb6eb1d..21b7e583e 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.c
@@ -32,21 +32,20 @@
 
 #include "utility/sys_port.h"
 
-
 #ifdef __aarch64__
 void depthwise_k3s1p1_int8_a72(int8_t* input, int8_t* kernel, int8_t* out, int* bias, long out_h, long out_w,
-                                        long multi, long shift, long input_w, long act_min, long act_max);
+                               long multi, long shift, long input_w, long act_min, long act_max);
 void depthwise_k3s2p1_int8_a72(int8_t* input, int8_t* kernel, int8_t* out, int* bias, long out_h, long out_w,
-                                        long multi, long shift, long input_w, long act_min, long act_max);
+                               long multi, long shift, long input_w, long act_min, long act_max);
 #else
 void depthwise_k3s1_int8(int8_t* input, int8_t* kernel, int8_t* out, int* bias, int out_h, int out_w,
-                             int multi, int shift, int input_w, int act_min, int act_max);
+                         int multi, int shift, int input_w, int act_min, int act_max);
 void depthwise_k3s2_int8(int8_t* input, int8_t* kernel, int8_t* out, int* bias, int out_h, int out_w,
-                                        int multi, int shift, int input_w, int act_min, int act_max);
+                         int multi, int shift, int input_w, int act_min, int act_max);
 #endif
 
 int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
-                    struct conv_priv_info* priv_info, struct conv_param* param)
+                        struct conv_priv_info* priv_info, struct conv_param* param)
 {
     int batch = input_tensor->dims[0];
     int in_c = input_tensor->dims[1];
@@ -67,19 +66,19 @@ int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tenso
     priv_info->activation_min = -127;
     priv_info->activation_max = 127;
     /*  set activation   */
-    if(param->activation >= 0)
+    if (param->activation >= 0)
     {
         priv_info->activation_min = 0;
-        if(param->activation == 1)
+        if (param->activation == 1)
             priv_info->activation_max = round(1.0 / output_scale);
-        if(param->activation == 6)
+        if (param->activation == 6)
             priv_info->activation_max = round(6.0 / output_scale);
 
-        if(priv_info->activation_max > 127)
+        if (priv_info->activation_max > 127)
             priv_info->activation_max = 127;
     }
 
-    for(int i=0; i<out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
         float kernel_scale = kernel_scales[i];
         float scale = input_scale * kernel_scale / output_scale;
@@ -88,7 +87,7 @@ int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tenso
         float q = frexp(scale, &shift);
         int fix_q = round(q * (1ll << 31));
         // TLOG_ERR("prerun: %f,%lld,%d,%d, %lld\n",q, fix_q, multi, q_shift, 1ll<<31);
-        if(fix_q == (1l << 31))
+        if (fix_q == (1l << 31))
         {
             fix_q /= 2;
             shift++;
@@ -117,8 +116,8 @@ int conv_dw_int8_postrun(struct conv_priv_info* priv_info)
 }
 
 void conv_dw_int8_direct(int8_t* input_buf, int8_t* weight_buf, int8_t* output_buf, int* bias, int input_h, int input_w,
-                    int output_h, int output_w, int channel_num, int stride, int* pads, int* p_multi, int* p_shift, 
-                    int activation_min, int activation_max, int num_thread, int cpu_affinity)
+                         int output_h, int output_w, int channel_num, int stride, int* pads, int* p_multi, int* p_shift,
+                         int activation_min, int activation_max, int num_thread, int cpu_affinity)
 {
     int channel_size = input_h * input_w;
 #ifndef __aarch64__
@@ -126,24 +125,24 @@ void conv_dw_int8_direct(int8_t* input_buf, int8_t* weight_buf, int8_t* output_b
     int input_h_pad = input_h + pads[0] + pads[2];
     int input_w_pad = input_w + pads[1] + pads[3];
     int is_pad0 = (pads[0] == 0 && pads[1] == 0 && pads[2] == 0 && pads[3] == 0);
-    if(!is_pad0)
+    if (!is_pad0)
     {
         input_pad = (int8_t*)malloc(sizeof(int8_t) * channel_num * input_h_pad * input_w_pad + 128);
         memset(input_pad, 0, sizeof(int8_t) * channel_num * input_h_pad * input_w_pad + 128);
     }
 #endif
 #pragma omp parallel for num_threads(num_thread)
-    for(int i = 0; i < channel_num; i++)
+    for (int i = 0; i < channel_num; i++)
     {
         int8_t* input_tmp = NULL;
         int* bias_tmp = bias ? (bias + i) : NULL;
 #ifndef __aarch64__
-        if(!is_pad0)
+        if (!is_pad0)
         {
             int8_t* tmp = input_pad + i * input_h_pad * input_w_pad;
             input_tmp = tmp;
             tmp += pads[0] * input_w_pad + pads[1];
-            for(int j = 0; j < input_h; j ++)
+            for (int j = 0; j < input_h; j++)
             {
                 memcpy(tmp, input_buf + i * channel_size + j * input_w, input_w);
                 tmp += input_w_pad;
@@ -154,29 +153,29 @@ void conv_dw_int8_direct(int8_t* input_buf, int8_t* weight_buf, int8_t* output_b
         {
             input_tmp = input_buf + i * channel_size;
         }
-        if(1 == stride)
+        if (1 == stride)
         {
 #ifdef __aarch64__
             depthwise_k3s1p1_int8_a72(input_tmp, weight_buf + 9 * i, output_buf + i * output_h * output_w, bias_tmp, output_h, output_w,
-                                    p_multi[i], p_shift[i], input_w, activation_min, activation_max);
+                                      p_multi[i], p_shift[i], input_w, activation_min, activation_max);
 #else
             depthwise_k3s1_int8(input_tmp, weight_buf + 9 * i, output_buf + i * output_h * output_w, bias_tmp, output_h, output_w,
-                                    p_multi[i], p_shift[i], input_w_pad, activation_min, activation_max);
+                                p_multi[i], p_shift[i], input_w_pad, activation_min, activation_max);
 #endif
         }
-        else if(2 == stride)
+        else if (2 == stride)
         {
 #ifdef __aarch64__
             depthwise_k3s2p1_int8_a72(input_tmp, weight_buf + 9 * i, output_buf + i * output_h * output_w, bias_tmp, output_h, output_w,
-                                    p_multi[i], p_shift[i], input_w, activation_min, activation_max);
+                                      p_multi[i], p_shift[i], input_w, activation_min, activation_max);
 #else
             depthwise_k3s2_int8(input_tmp, weight_buf + 9 * i, output_buf + i * output_h * output_w, bias_tmp, output_h, output_w,
-                                    p_multi[i], p_shift[i], input_w_pad, activation_min, activation_max);
+                                p_multi[i], p_shift[i], input_w_pad, activation_min, activation_max);
 #endif
         }
     }
 #ifndef __aarch64__
-    if(!is_pad0)
+    if (!is_pad0)
     {
         free(input_pad);
         input_pad = NULL;
@@ -185,8 +184,8 @@ void conv_dw_int8_direct(int8_t* input_buf, int8_t* weight_buf, int8_t* output_b
 }
 
 int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                 struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                 int num_thread, int cpu_affinity)
+                     struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
+                     int num_thread, int cpu_affinity)
 {
     /* param */
     int pads[4] = {0};
@@ -200,8 +199,8 @@ int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor,
     int act_type = param->activation;
     pads[0] = param->pad_h0;
     pads[1] = param->pad_w0;
-    pads[2] = param->pad_h1;    
-    pads[3] = param->pad_w1;    
+    pads[2] = param->pad_h1;
+    pads[3] = param->pad_w1;
 
     int batch = input_tensor->dims[0];
     int in_c = input_tensor->dims[1] / group;
@@ -223,9 +222,9 @@ int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor,
     int activation_max = priv_info->activation_max;
 
     /* buffer addr */
-    int8_t* input_buf = ( int8_t* )input_tensor->data;
-    int8_t* kernel_buf = ( int8_t* )filter_tensor->data;
-    int8_t* output_buf = ( int8_t* )output_tensor->data;
+    int8_t* input_buf = (int8_t*)input_tensor->data;
+    int8_t* kernel_buf = (int8_t*)filter_tensor->data;
+    int8_t* output_buf = (int8_t*)output_tensor->data;
     int32_t* biases_buf = NULL;
     if (bias_tensor != NULL)
     {
@@ -234,13 +233,13 @@ int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor,
 
     int* multi = priv_info->multi;
     int* q_shift = priv_info->q_shift;
-    for (int n = 0; n < batch; n++)    // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
         int8_t* input = input_buf + n * input_size * group;
         int8_t* kernel = kernel_buf + n * kernel_size * group;
         int8_t* output = output_buf + n * output_size * group;
-        conv_dw_int8_direct(input, kernel, output, biases_buf, in_h, in_w, 
-                            out_h, out_w, in_c * group, stride_h, pads, multi, q_shift, 
+        conv_dw_int8_direct(input, kernel, output, biases_buf, in_h, in_w,
+                            out_h, out_w, in_c * group, stride_h, pads, multi, q_shift,
                             activation_min, activation_max, num_thread, cpu_affinity);
     }
     return 0;
diff --git a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h
index 52bdc89aa..d5d685c6d 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/conv_dw_kernel_int8_arm.h
@@ -32,12 +32,11 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int conv_dw_int8_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
-                    struct conv_priv_info* priv_info, struct conv_param* param);
+                        struct conv_priv_info* priv_info, struct conv_param* param);
 int conv_dw_int8_postrun(struct conv_priv_info* priv_info);
 int conv_dw_int8_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                 struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                 int num_thread, int cpu_affinity);
+                     struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
+                     int num_thread, int cpu_affinity);
 
 #endif
\ No newline at end of file
diff --git a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c
index 977c8d4e2..5958c7c38 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c
@@ -40,7 +40,6 @@
 
 #include <string.h>
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -49,8 +48,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* get cpu affinity */
     conv_priv_info->cpu_type = exec_graph->cpu_affinity;
@@ -69,7 +68,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
         if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size)
         {
             if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem,
-                                              exec_graph->shared_pack4_mem_size) < 0)
+                                              exec_graph->shared_pack4_mem_size)
+                < 0)
             {
                 TLOG_ERR("hcl conv: set shared pack4 memory failed\n");
                 return -1;
@@ -148,14 +148,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 run */
     if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8)
     {
         if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread,
-                         cpu_affinity) < 0)
+                         cpu_affinity)
+            < 0)
         {
             TLOG_ERR("hcl conv run failed\n");
             return -1;
@@ -177,7 +178,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     else if (exec_graph->mode == TENGINE_MODE_INT8)
     {
         if (int8_conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread,
-                         cpu_affinity) < 0)
+                              cpu_affinity)
+            < 0)
         {
             TLOG_ERR("hcl conv int8 run failed\n");
             return -1;
@@ -201,7 +203,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* dynamic get the shape of output tensor */
     int n = input_tensor->dims[0];
@@ -263,10 +265,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     }
     else
     {
-        out_h =
-            (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) /
-            conv_param->stride_h +
-            1;
+        out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1;
     }
 
     if (conv_param->pad_w0 < 0)
@@ -289,10 +288,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     }
     else
     {
-        out_w =
-            (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) /
-            conv_param->stride_w +
-            1;
+        out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1;
     }
 
     int dims[4];
@@ -305,7 +301,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
             dims[2] = out_h;
             dims[3] = out_w;
 
-            for (int i=0; i<4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 if (dims[i] == 0)
                     dims[i] = 1;
@@ -322,7 +318,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
             dims[2] = out_w;
             dims[3] = out_c;
 
-            for (int i=0; i<4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 if (dims[i] == 0)
                     dims[i] = 1;
@@ -337,7 +333,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 postrun */
     if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8)
@@ -390,10 +386,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* init the private info data of convolution op */
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info));
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info));
     if (conv_priv_info == NULL)
     {
         return -1;
@@ -429,7 +425,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
 
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
     sys_free(conv_priv_info);
     exec_node->ops_priv = NULL;
 
@@ -442,7 +438,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     int group = param->group;
     int kernel_h = param->kernel_h;
     int kernel_w = param->kernel_w;
@@ -450,7 +446,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     int out_c = output_tensor->dims[1] / group;
 
     /* todo support int8/fp16 */
-#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC    
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     if (input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_FP16)
         return 0;
 
@@ -466,14 +462,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {
-        .prerun = prerun,
-        .run = run,
-        .reshape = reshape,
-        .postrun = postrun,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score
-};
+    .prerun = prerun,
+    .run = run,
+    .reshape = reshape,
+    .postrun = postrun,
+    .init_node = init_node,
+    .release_node = release_node,
+    .score = score};
 
 int register_conv_hcl_arm_op()
 {
diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c
index 9e91564d9..dc10dec4c 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.c
@@ -118,9 +118,9 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern
 /* kernel interleave */
 static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param)
 {
-    int group       = param->group;
+    int group = param->group;
     int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3];
-    int out_chan    = filter->dims[0] / group;
+    int out_chan = filter->dims[0] / group;
     int out_chan_align4 = (out_chan + 3) / 4 * 4;
 
     int kernel_size_algin = kernel_size * out_chan_align4;
@@ -130,7 +130,7 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info,
     float* interleave_buf = priv_info->interleave_buffer;
     for (int g = 0; g < group; g++)
     {
-        float* cur_kernel     = kernel + g * kernel_size_group;
+        float* cur_kernel = kernel + g * kernel_size_group;
         float* cur_interleave = interleave_buf + g * kernel_size_algin;
         interleave_kernel(cur_kernel, cur_interleave, out_chan, kernel_size);
     }
@@ -145,7 +145,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
         int in_xy = in_w * in_h;
         int out_xy = out_w * out_h;
         int col_end3 = out_xy & 3;
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int col_i = 0; col_i < out_xy - 3; col_i += 4)
         {
             float* cur_col = col + col_i * kernel_size;
@@ -179,7 +179,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
         int out_xy = out_w * out_h;
         int col_end3 = out_xy & 3;
         int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0);
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int col_i = 0; col_i < (out_xy & -4); col_i += 4)
         {
             float* cur_col = col + col_i * kernel_size;
@@ -255,7 +255,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
     else
     {
         int out_xy = out_w * out_h;
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int col_i = 0; col_i < out_xy - 3; col_i += 4)
         {
             int kernel_size = k_w * k_h * in_c;
@@ -318,7 +318,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
 }
 
 static void sgemm_set(float* col, float* kernel, float* biases, float* output, int kernel_size, int col_start,
-                        int col_end, int kernel_start, int kernel_end, int output_xy, int activation, int num_thread, int cpu_affinity)
+                      int col_end, int kernel_start, int kernel_end, int output_xy, int activation, int num_thread, int cpu_affinity)
 {
     int col_end3 = col_end & 0x3;
     int nn_outch = kernel_end / PER_OUT_CHAN;
@@ -327,21 +327,21 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i
     for (int pp = 0; pp < nn_outch; pp++)
     {
         int p = pp * PER_OUT_CHAN;
-        float* biasptr = biases ? ( float* )(biases + p) : NULL;
-        float* kernel_tmp = ( float* )(kernel + p * kernel_size);
-        float* output_tmp = ( float* )(output + p * output_xy);
+        float* biasptr = biases ? (float*)(biases + p) : NULL;
+        float* kernel_tmp = (float*)(kernel + p * kernel_size);
+        float* output_tmp = (float*)(output + p * output_xy);
 
         for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
 #ifdef __aarch64__
         {
-            float* col_tmp = ( float* )(col + col_line * kernel_size);
+            float* col_tmp = (float*)(col + col_line * kernel_size);
             sgemm_4x16_a72(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0);
         }
         if (col_end3)
         {
             int col_line = col_end & -4;
             float result[4 * PER_OUT_CHAN];
-            float* col_tmp = ( float* )(col + col_line * kernel_size);
+            float* col_tmp = (float*)(col + col_line * kernel_size);
 
             sgemm_4x16_a72(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0);
 
@@ -355,14 +355,14 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i
         }
 #else
         {
-            float* col_tmp = ( float* )(col + col_line * kernel_size);
+            float* col_tmp = (float*)(col + col_line * kernel_size);
             sgemm_4x12_a17(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0);
         }
         if (col_end3)
         {
             int col_line = col_end & -4;
             float result[4 * PER_OUT_CHAN];
-            float* col_tmp = ( float* )(col + col_line * kernel_size);
+            float* col_tmp = (float*)(col + col_line * kernel_size);
 
             sgemm_4x12_a17(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0);
 
@@ -385,16 +385,16 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
     int kernel_end3 = kernel_end & 0x3;
 
 #pragma omp parallel for num_threads(num_thread)
-    for (int kernel_num = (kernel_start & -4); kernel_num  < (kernel_end & -4); kernel_num += 4)
+    for (int kernel_num = (kernel_start & -4); kernel_num < (kernel_end & -4); kernel_num += 4)
     {
         float *cur_col, *cur_kernel, *cur_output;
-        float* cur_biases = biases ? ( float* )(biases + kernel_num) : NULL;
+        float* cur_biases = biases ? (float*)(biases + kernel_num) : NULL;
 
-        cur_kernel = ( float* )(kernel + kernel_num * kernel_size);
-        cur_output = ( float* )(output + kernel_num * output_xy);
+        cur_kernel = (float*)(kernel + kernel_num * kernel_size);
+        cur_output = (float*)(output + kernel_num * output_xy);
         for (int col_line = 0; col_line < (col_end & -4); col_line += 4)
         {
-            cur_col = ( float* )(col + col_line * kernel_size);
+            cur_col = (float*)(col + col_line * kernel_size);
 #ifdef __aarch64__
             sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, cur_output + col_line, output_xy, activation, 0);
 #else
@@ -405,7 +405,7 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
         {
             float result[16];
             int col_line = col_end & -4;
-            cur_col = ( float* )(col + col_line * kernel_size);
+            cur_col = (float*)(col + col_line * kernel_size);
 #ifdef __aarch64__
             sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
 #else
@@ -421,14 +421,14 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
     if (kernel_end3)
     {
         int kernel_num = (kernel_end & -4);
-        float* cur_biases = biases ? ( float* )(biases + kernel_num) : NULL;
-        float* cur_kernel = ( float* )(kernel + kernel_num * kernel_size);
+        float* cur_biases = biases ? (float*)(biases + kernel_num) : NULL;
+        float* cur_kernel = (float*)(kernel + kernel_num * kernel_size);
 
 #pragma omp parallel for num_threads(num_thread)
         for (int col_line = 0; col_line < (col_end & -4); col_line += 4)
         {
             float result[16];
-            float* cur_col = ( float* )(col + col_line * kernel_size);
+            float* cur_col = (float*)(col + col_line * kernel_size);
 #ifdef __aarch64__
             sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
 #else
@@ -443,7 +443,7 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
         {
             float result[16];
             int col_line = col_end & -4;
-            float* cur_col = ( float* )(col + col_line * kernel_size);
+            float* cur_col = (float*)(col + col_line * kernel_size);
 #ifdef __aarch64__
             sgemm_4x4_a72(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
 #else
@@ -487,15 +487,15 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w)
  */
 int conv_hcl_get_shared_mem_size(struct tensor* input, struct tensor* output, struct conv_param* param)
 {
-    int in_h  = input->dims[2];
-    int in_w  = input->dims[3];
+    int in_h = input->dims[2];
+    int in_w = input->dims[3];
     int out_h = output->dims[2];
     int out_w = output->dims[3];
     int group = param->group;
-    int input_chan  = param->input_channel / group;
+    int input_chan = param->input_channel / group;
     int kernel_size = input_chan * param->kernel_h * param->kernel_w;
-    int out_cstep   = out_h * out_w;      // channel cstep, output_h * output_w
-    int elem_size   = input->elem_size;   // uint8/int8 is 1 byte, fp32 is 4 bytes
+    int out_cstep = out_h * out_w;    // channel cstep, output_h * output_w
+    int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes
 
     out_cstep = (out_cstep + 3) / 4 * 4;
     int mem_size = elem_size * kernel_size * out_cstep + 128;
@@ -512,7 +512,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
     int out_chan = filter->dims[0] / group;
     int out_chan_align4 = (out_chan + 3) / 4 * 4;
     int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3];
-    int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128;    // caution
+    int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution
 
     return mem_size;
 }
@@ -552,7 +552,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s
     if (priv_info->winograd)
     {
 #ifdef __aarch64__
-        if(in_c >= 256)
+        if (in_c >= 256)
             return wino_conv_hcl_prerun_1(input_tensor, filter_tensor, output_tensor, priv_info, param);
         else
 #endif
@@ -564,7 +564,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s
     {
         int mem_size = conv_hcl_get_shared_mem_size(input_tensor, output_tensor, param);
         void* mem = sys_malloc(mem_size);
-        priv_info->im2col_buffer      = mem;
+        priv_info->im2col_buffer = mem;
         priv_info->im2col_buffer_size = mem_size;
     }
 
@@ -573,7 +573,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s
     {
         int mem_size = get_private_mem_size(filter_tensor, param);
         void* mem = sys_malloc(mem_size);
-        priv_info->interleave_buffer      = mem;
+        priv_info->interleave_buffer = mem;
         priv_info->interleave_buffer_size = mem_size;
     }
 
@@ -634,7 +634,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
     if (priv_info->winograd)
     {
 #ifdef __aarch64__
-        if(in_c >= 256)
+        if (in_c >= 256)
             return wino_conv_hcl_run_1(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, num_thread, cpu_affinity);
         else
 #endif
@@ -650,13 +650,13 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
     int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3];
 
     /* buffer addr */
-    float* input_buf = ( float* )input_tensor->data;
-    float* output_buf = ( float* )output_tensor->data;
+    float* input_buf = (float*)input_tensor->data;
+    float* output_buf = (float*)output_tensor->data;
     float* biases_buf = NULL;
     if (bias_tensor != NULL)
-        biases_buf = ( float* )bias_tensor->data;
-    float* col_buf = ( float* )priv_info->im2col_buffer;
-    float* interleave_buf = ( float* )priv_info->interleave_buffer;
+        biases_buf = (float*)bias_tensor->data;
+    float* col_buf = (float*)priv_info->im2col_buffer;
+    float* interleave_buf = (float*)priv_info->interleave_buffer;
 
     /* block size split parameter */
     int L2_CACHE_SIZE = ((cpu_affinity == TENGINE_CLUSTER_LITTLE) ? 512 : 1024) * 1024;
@@ -666,7 +666,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
     int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN;
     int sgemm_set_remain = out_c % PER_OUT_CHAN;
 
-    for (int n = 0; n < batch; n++)    // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
         for (int g = 0; g < group; g++)
         {
@@ -677,19 +677,19 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
 
             /* im2col */
             im2col(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h,
-                pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread);
+                   pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread);
 
-            for(int col_i = 0; col_i < out_hw; col_i += col_cnt_l2)
+            for (int col_i = 0; col_i < out_hw; col_i += col_cnt_l2)
             {
                 int col_start = col_i;
                 int col_end = col_i + col_cnt_l2;
                 col_end = col_end > out_hw ? out_hw : col_end;
                 /* gemm */
                 sgemm_set(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, col_start, col_end, 0, sgemm_set_chan, out_hw, act_type,
-                        num_thread, cpu_affinity);
+                          num_thread, cpu_affinity);
                 if (sgemm_set_remain)
                     sgemm4x4(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, col_start, col_end, sgemm_set_chan, out_c, out_hw,
-                            act_type, num_thread, cpu_affinity);
+                             act_type, num_thread, cpu_affinity);
             }
         }
     }
diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h
index 1c489e0c6..041b4980e 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_arm.h
@@ -31,7 +31,6 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 /* float32 */
 int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
                     struct conv_priv_info* info, struct conv_param* param);
@@ -52,29 +51,29 @@ int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, i
 
 /* fp16 */
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-int fp16_conv_hcl_prerun(struct tensor*  input_tensor,
-                    struct tensor*  filter_tensor,
-                    struct tensor*  output_tensor,
-                    struct conv_priv_info* info,     
-                    struct conv_param* param) ;
+int fp16_conv_hcl_prerun(struct tensor* input_tensor,
+                         struct tensor* filter_tensor,
+                         struct tensor* output_tensor,
+                         struct conv_priv_info* info,
+                         struct conv_param* param);
 
 int fp16_conv_hcl_postrun(struct conv_priv_info* info);
 
-int fp16_conv_hcl_run(struct tensor* input_tensor , struct tensor* filter_tensor ,struct tensor* bias_tensor ,  struct tensor* output_tensor , struct conv_priv_info*  conv_info ,struct conv_param* param, int num_thread, int cpu_affinity) ;
+int fp16_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity);
 
-int fp16_conv_hcl_get_shared_mem_size(struct tensor*  input_tensor ,struct tensor*  output_tensor , struct conv_param* param) ;
+int fp16_conv_hcl_get_shared_mem_size(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param);
 #endif
 
 /* int8 */
 int int8_conv_hcl_get_shared_mem_size(struct tensor* input_tensor, struct tensor* output_tensor,
-                                 struct conv_param* param);
+                                      struct conv_param* param);
 int int8_conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size);
 int int8_conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size);
 
 int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
-                    struct conv_priv_info* priv_info, struct conv_param* param);
+                         struct conv_priv_info* priv_info, struct conv_param* param);
 int int8_conv_hcl_postrun(struct conv_priv_info* priv_info);
 int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                 struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                 int num_thread, int cpu_affinity);
+                      struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
+                      int num_thread, int cpu_affinity);
 #endif
diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c
index f96b8b5d6..6c17a77ac 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.c
@@ -32,12 +32,11 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-
 #ifdef __aarch64__
 void i8gemm_4x16_a72_int8(int* biases, int8_t* input, int8_t* kernel, long kernel_size, int8_t* output,
-                                int* multi, long output_xy, int* shift, int activation_min, int activation_max);
+                          int* multi, long output_xy, int* shift, int activation_min, int activation_max);
 void i8gemm_4x4_a72_int8(int* biases, int8_t* input, int8_t* kernel, long kernel_size, int8_t* output,
-                                int* multi, long output_xy, int* shift, int activation_min, int activation_max);
+                         int* multi, long output_xy, int* shift, int activation_min, int activation_max);
 void im2col_int8_1x1(int8_t* input, long input_xy, int8_t* col, long col_cnt, long input_chan);
 void im2col_int8_3x3(int8_t* input, long input_x, long input_y, long input_chan, int8_t* col, long stride);
 // col_start and col_end need to be 16 aligned
@@ -50,10 +49,10 @@ static void i8gemm4x16(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
     int kernel_size_aligned2 = (kernel_size + 1) & -2;
 
 #pragma omp parallel for num_threads(num_thread)
-    for(int kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16)
+    for (int kernel_num = (kernel_start & -16); kernel_num < (kernel_end & -16); kernel_num += 16)
     {
         int* cur_biases = NULL;
-        if(bias_term)
+        if (bias_term)
         {
             cur_biases = biases + kernel_num;
         }
@@ -67,24 +66,24 @@ static void i8gemm4x16(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
         int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2;
         int8_t* output_result = output + kernel_num * output_xy;
 
-        for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
+        for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
         {
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
-            
+
             i8gemm_4x16_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti,
-                              output_xy, pq_shift, activation_min, activation_max);
+                                 output_xy, pq_shift, activation_min, activation_max);
         }
 
-        if(col_end3)
+        if (col_end3)
         {
             int col_line = col_end & -4;
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x16_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
-                for(int j = 0; j < 4; j++)
+                for (int j = 0; j < 4; j++)
                 {
                     output_line[j] = output + (kernel_num + i * 4 + j) * output_xy + col_line;
                 }
@@ -94,14 +93,14 @@ static void i8gemm4x16(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
                 *(output_line[2] + 0) = result[i * 16 + 10];
                 *(output_line[3] + 0) = result[i * 16 + 15];
 
-                if((col_end3) >= 2)
+                if ((col_end3) >= 2)
                 {
                     *(output_line[0] + 1) = result[i * 16 + 4];
                     *(output_line[1] + 1) = result[i * 16 + 1];
                     *(output_line[2] + 1) = result[i * 16 + 14];
                     *(output_line[3] + 1) = result[i * 16 + 11];
                 }
-                if((col_end3) == 3)
+                if ((col_end3) == 3)
                 {
                     *(output_line[0] + 2) = result[i * 16 + 8];
                     *(output_line[1] + 2) = result[i * 16 + 13];
@@ -123,10 +122,10 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
     int kernel_size_aligned2 = (kernel_size + 1) & -2;
 
 #pragma omp parallel for num_threads(num_thread)
-    for(int kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4)
+    for (int kernel_num = kernel_start & -4; kernel_num < (kernel_end & -4); kernel_num += 4)
     {
         int* cur_biases = NULL;
-        if(bias_term)
+        if (bias_term)
         {
             cur_biases = biases + kernel_num;
         }
@@ -140,21 +139,21 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
         int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2;
         int8_t* output_result = output + kernel_num * output_xy;
 
-        for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
+        for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
         {
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti,
-                              output_xy, pq_shift, activation_min, activation_max);
+                                output_xy, pq_shift, activation_min, activation_max);
         }
-        if(col_end3)
+        if (col_end3)
         {
             int col_line = col_end & -4;
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int j = 0; j < 4; j++)
+            for (int j = 0; j < 4; j++)
             {
                 output_line[j] = output + (kernel_num + j) * output_xy + col_line;
             }
@@ -164,14 +163,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
             *(output_line[2] + 0) = result[10];
             *(output_line[3] + 0) = result[15];
 
-            if(col_end3 >= 2)
+            if (col_end3 >= 2)
             {
                 *(output_line[0] + 1) = result[4];
                 *(output_line[1] + 1) = result[1];
                 *(output_line[2] + 1) = result[14];
                 *(output_line[3] + 1) = result[11];
             }
-            if(col_end3 == 3)
+            if (col_end3 == 3)
             {
                 *(output_line[0] + 2) = result[8];
                 *(output_line[1] + 2) = result[13];
@@ -180,11 +179,11 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
             }
         }
     }
-    if(kernel_end3)
+    if (kernel_end3)
     {
         int kernel_num = kernel_end & -4;
         int* cur_biases = NULL;
-        if(bias_term)
+        if (bias_term)
         {
             cur_biases = biases + kernel_num;
         }
@@ -196,13 +195,13 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
         int* pq_shift = q_shift + kernel_num;
         int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2;
 
-        for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
+        for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
         {
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int j = 0; j < 4; j++)
+            for (int j = 0; j < 4; j++)
             {
                 output_line[j] = output + (kernel_num + j) * output_xy + col_line;
             }
@@ -212,14 +211,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
             *(output_line[0] + 2) = result[8];
             *(output_line[0] + 3) = result[12];
 
-            if(kernel_end3 >= 2)
+            if (kernel_end3 >= 2)
             {
                 *(output_line[1] + 0) = result[5];
                 *(output_line[1] + 1) = result[1];
                 *(output_line[1] + 2) = result[13];
                 *(output_line[1] + 3) = result[9];
             }
-            if(kernel_end3 == 3)
+            if (kernel_end3 == 3)
             {
                 *(output_line[2] + 0) = result[10];
                 *(output_line[2] + 1) = result[14];
@@ -227,37 +226,37 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
                 *(output_line[2] + 3) = result[6];
             }
         }
-        if(col_end3)
+        if (col_end3)
         {
             int col_line = col_end & -4;
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a72_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int j = 0; j < 4; j++)
+            for (int j = 0; j < 4; j++)
             {
                 output_line[j] = output + (kernel_num + j) * output_xy + col_line;
             }
 
             *(output_line[0] + 0) = result[0];
-            if(col_end3 >= 2)
+            if (col_end3 >= 2)
                 *(output_line[0] + 1) = result[4];
-            if(col_end3 == 3)
+            if (col_end3 == 3)
                 *(output_line[0] + 2) = result[8];
-            if(kernel_end3 >= 2)
+            if (kernel_end3 >= 2)
             {
                 *(output_line[1] + 0) = result[5];
-                if(col_end3 >= 2)
+                if (col_end3 >= 2)
                     *(output_line[1] + 1) = result[1];
-                if(col_end3 == 3)
+                if (col_end3 == 3)
                     *(output_line[1] + 2) = result[13];
             }
-            if(kernel_end3 == 3)
+            if (kernel_end3 == 3)
             {
                 *(output_line[2] + 0) = result[10];
-                if(col_end3 >= 2)
+                if (col_end3 >= 2)
                     *(output_line[2] + 1) = result[14];
-                if(col_end3 == 3)
+                if (col_end3 == 3)
                     *(output_line[2] + 2) = result[2];
             }
         }
@@ -279,10 +278,10 @@ static void i8gemm4x8(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
     int kernel_size_aligned2 = (kernel_size + 1) & -2;
 
 #pragma omp parallel for num_threads(num_thread)
-    for(int kernel_num = (kernel_start & -8); kernel_num < (kernel_end & -8); kernel_num += 8)
+    for (int kernel_num = (kernel_start & -8); kernel_num < (kernel_end & -8); kernel_num += 8)
     {
         int* cur_biases = NULL;
-        if(bias_term)
+        if (bias_term)
         {
             cur_biases = biases + kernel_num;
         }
@@ -296,23 +295,23 @@ static void i8gemm4x8(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
         int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2;
         int8_t* output_result = output + kernel_num * output_xy;
 
-        for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
+        for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
         {
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x8_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti,
-                               output_xy, pq_shift, activation_min, activation_max);
+                                output_xy, pq_shift, activation_min, activation_max);
         }
-        if(col_end3)
+        if (col_end3)
         {
             int col_line = col_end & -4;
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x8_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int i = 0; i < 2; i++)
+            for (int i = 0; i < 2; i++)
             {
-                for(int j = 0; j < 4; j++)
+                for (int j = 0; j < 4; j++)
                 {
                     output_line[j] = output + (kernel_num + i * 4 + j) * output_xy + col_line;
                 }
@@ -322,14 +321,14 @@ static void i8gemm4x8(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
                 *(output_line[2] + 0) = result[i * 16 + 10];
                 *(output_line[3] + 0) = result[i * 16 + 15];
 
-                if(col_end3 >= 2)
+                if (col_end3 >= 2)
                 {
                     *(output_line[0] + 1) = result[i * 16 + 4];
                     *(output_line[1] + 1) = result[i * 16 + 1];
                     *(output_line[2] + 1) = result[i * 16 + 14];
                     *(output_line[3] + 1) = result[i * 16 + 11];
                 }
-                if(col_end3 == 3)
+                if (col_end3 == 3)
                 {
                     *(output_line[0] + 2) = result[i * 16 + 8];
                     *(output_line[1] + 2) = result[i * 16 + 13];
@@ -352,10 +351,10 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
     int kernel_size_aligned2 = (kernel_size + 1) & -2;
 
 #pragma omp parallel for num_threads(num_thread)
-    for(int kernel_num = (kernel_start & -4); kernel_num < (kernel_end & -4); kernel_num += 4)
+    for (int kernel_num = (kernel_start & -4); kernel_num < (kernel_end & -4); kernel_num += 4)
     {
         int* cur_biases = NULL;
-        if(bias_term)
+        if (bias_term)
         {
             cur_biases = biases + kernel_num;
         }
@@ -364,27 +363,27 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
         int8_t* output_line[4];
 
         int* pmulti = multi + kernel_num;
-	    int* pq_shift = q_shift + kernel_num;
+        int* pq_shift = q_shift + kernel_num;
 
         int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2;
         int8_t* output_result = output + kernel_num * output_xy;
 
-        for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
+        for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
         {
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, output_result + col_line, pmulti,
-                               output_xy, pq_shift, activation_min, activation_max);
+                                output_xy, pq_shift, activation_min, activation_max);
         }
 
-        if(col_end3)
+        if (col_end3)
         {
             int col_line = col_end & -4;
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int j = 0; j < 4; j++)
+            for (int j = 0; j < 4; j++)
             {
                 output_line[j] = output + (kernel_num + j) * output_xy + col_line;
             }
@@ -394,14 +393,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
             *(output_line[2] + 0) = result[10];
             *(output_line[3] + 0) = result[15];
 
-            if(col_end3 >= 2)
+            if (col_end3 >= 2)
             {
                 *(output_line[0] + 1) = result[4];
                 *(output_line[1] + 1) = result[1];
                 *(output_line[2] + 1) = result[14];
                 *(output_line[3] + 1) = result[11];
             }
-            if(col_end3 == 3)
+            if (col_end3 == 3)
             {
                 *(output_line[0] + 2) = result[8];
                 *(output_line[1] + 2) = result[13];
@@ -410,11 +409,11 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
             }
         }
     }
-    if(kernel_end3)
+    if (kernel_end3)
     {
         int kernel_num = kernel_end & -4;
         int* cur_biases = NULL;
-        if(bias_term)
+        if (bias_term)
         {
             cur_biases = biases + kernel_num;
         }
@@ -426,13 +425,13 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
         int* pq_shift = q_shift + kernel_num;
         int8_t* cur_kernel = kernel + kernel_num * kernel_size_aligned2;
 
-        for(int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
+        for (int col_line = (col_start & -4); col_line < (col_end & -4); col_line += 4)
         {
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int j = 0; j < 4; j++)
+            for (int j = 0; j < 4; j++)
             {
                 output_line[j] = output + (kernel_num + j) * output_xy + col_line;
             }
@@ -442,14 +441,14 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
             *(output_line[0] + 2) = result[8];
             *(output_line[0] + 3) = result[12];
 
-            if(kernel_end3 >= 2)
+            if (kernel_end3 >= 2)
             {
                 *(output_line[1] + 0) = result[5];
                 *(output_line[1] + 1) = result[1];
                 *(output_line[1] + 2) = result[13];
                 *(output_line[1] + 3) = result[9];
             }
-            if(kernel_end3 == 3)
+            if (kernel_end3 == 3)
             {
                 *(output_line[2] + 0) = result[10];
                 *(output_line[2] + 1) = result[14];
@@ -457,37 +456,37 @@ static void i8gemm4x4(int8_t* col, int8_t* kernel, bool bias_term, int* biases,
                 *(output_line[2] + 3) = result[6];
             }
         }
-        if(col_end3)
+        if (col_end3)
         {
             int col_line = col_end & -4;
             int8_t* cur_col = col + col_line * kernel_size_aligned2;
 
             i8gemm_4x4_a17_int8(cur_biases, cur_col, cur_kernel, kernel_size_aligned2, (int8_t*)result, pmulti, 0, pq_shift, activation_min, activation_max);
 
-            for(int j = 0; j < 4; j++)
+            for (int j = 0; j < 4; j++)
             {
                 output_line[j] = output + (kernel_num + j) * output_xy + col_line;
             }
 
             *(output_line[0] + 0) = result[0];
-            if(col_end3 >= 2)
+            if (col_end3 >= 2)
                 *(output_line[0] + 1) = result[4];
-            if(col_end3 == 3)
+            if (col_end3 == 3)
                 *(output_line[0] + 2) = result[8];
-            if(kernel_end3 >= 2)
+            if (kernel_end3 >= 2)
             {
                 *(output_line[1] + 0) = result[5];
-                if(col_end3 >= 2)
+                if (col_end3 >= 2)
                     *(output_line[1] + 1) = result[1];
-                if(col_end3 == 3)
+                if (col_end3 == 3)
                     *(output_line[1] + 2) = result[13];
             }
-            if(kernel_end3 == 3)
+            if (kernel_end3 == 3)
             {
                 *(output_line[2] + 0) = result[10];
-                if(col_end3 >= 2)
+                if (col_end3 >= 2)
                     *(output_line[2] + 1) = result[14];
-                if(col_end3 == 3)
+                if (col_end3 == 3)
                     *(output_line[2] + 2) = result[2];
             }
         }
@@ -504,7 +503,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
     int out_chan = filter->dims[0] / group;
     int out_chan_align4 = (out_chan + 3) / 4 * 4;
     int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3];
-    int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128;    // caution
+    int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution
 
     return mem_size;
 }
@@ -527,18 +526,18 @@ int int8_conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* m
 }
 int int8_conv_hcl_get_shared_mem_size(struct tensor* input, struct tensor* output, struct conv_param* param)
 {
-    int in_h  = input->dims[2];
-    int in_w  = input->dims[3];
+    int in_h = input->dims[2];
+    int in_w = input->dims[3];
     int out_h = output->dims[2];
     int out_w = output->dims[3];
     int group = param->group;
-    int input_chan  = param->input_channel / group;
+    int input_chan = param->input_channel / group;
     int kernel_size = input_chan * param->kernel_h * param->kernel_w;
-    int out_cstep   = out_h * out_w;      // channel cstep, output_h * output_w
-    int elem_size   = input->elem_size;   // uint8/int8 is 1 byte, fp32 is 4 bytes
+    int out_cstep = out_h * out_w;    // channel cstep, output_h * output_w
+    int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes
 
     out_cstep = (out_cstep + 3) / 4 * 4;
-    
+
     int kernel_size_aligned2 = (kernel_size + 1) & -2;
     int mem_size = elem_size * kernel_size_aligned2 * out_cstep + 128;
 
@@ -553,18 +552,18 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan
     int i, j, k;
 
     // interleave 16 kernels
-    for(i = 0; i < (kernel_chan & -16); i += 16)
+    for (i = 0; i < (kernel_chan & -16); i += 16)
     {
-        for(j = 0; j < 16; j++)
+        for (j = 0; j < 16; j++)
             cur_kernel[j] = kernel + kernel_size * (i + j);
-        for(j = 0; j < (kernel_size & -2); j += 2)
-            for(k = 0; k < 16; k++)
+        for (j = 0; j < (kernel_size & -2); j += 2)
+            for (k = 0; k < 16; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
             }
-        if(kernel_size & 0x1)
-            for(k = 0; k < 16; k++)
+        if (kernel_size & 0x1)
+            for (k = 0; k < 16; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = 0;
@@ -572,87 +571,87 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan
     }
 
     // interleave 4 kernels
-    for(i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4)
+    for (i = (kernel_chan & -16); i < (kernel_chan & -4); i += 4)
     {
-        for(j = 0; j < 4; j++)
+        for (j = 0; j < 4; j++)
             cur_kernel[j] = kernel + kernel_size * (i + j);
-        for(j = 0; j < (kernel_size & -2); j += 2)
-            for(k = 0; k < 4; k++)
+        for (j = 0; j < (kernel_size & -2); j += 2)
+            for (k = 0; k < 4; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
             }
-        if(kernel_size & 0x1)
-            for(k = 0; k < 4; k++)
+        if (kernel_size & 0x1)
+            for (k = 0; k < 4; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = 0;
             }
     }
     // last 4 kernels
-    if((kernel_chan & 0x3) != 0)
+    if ((kernel_chan & 0x3) != 0)
     {
-        for(j = 0; j < 3; j++)
+        for (j = 0; j < 3; j++)
             cur_kernel[j] = kernel + kernel_size * (i + j);
-        if((kernel_chan & 0x3) == 3)
+        if ((kernel_chan & 0x3) == 3)
         {
-            for(j = 0; j < (kernel_size & -2); j += 2)
+            for (j = 0; j < (kernel_size & -2); j += 2)
             {
-                for(k = 0; k < 3; k++)
+                for (k = 0; k < 3; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
                 }
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                     *(cur_kernel_int8++) = 0;
             }
-            if(kernel_size & 0x1)
+            if (kernel_size & 0x1)
             {
-                for(k = 0; k < 3; k++)
+                for (k = 0; k < 3; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = 0;
                 }
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                     *(cur_kernel_int8++) = 0;
             }
         }
-        else if((kernel_chan & 0x3) == 2)
+        else if ((kernel_chan & 0x3) == 2)
         {
-            for(j = 0; j < (kernel_size & -2); j += 2)
+            for (j = 0; j < (kernel_size & -2); j += 2)
             {
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
                 }
-                for(k = 0; k < 4; k++)
+                for (k = 0; k < 4; k++)
                     *(cur_kernel_int8++) = 0;
             }
-            if(kernel_size & 0x1)
+            if (kernel_size & 0x1)
             {
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = 0;
                 }
-                for(k = 0; k < 4; k++)
+                for (k = 0; k < 4; k++)
                     *(cur_kernel_int8++) = 0;
             }
         }
-        else if((kernel_chan & 0x3) == 1)
+        else if ((kernel_chan & 0x3) == 1)
         {
-            for(j = 0; j < (kernel_size & -2); j += 2)
+            for (j = 0; j < (kernel_size & -2); j += 2)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[0] + j);
                 *(cur_kernel_int8++) = *(cur_kernel[0] + j + 1);
-                for(k = 0; k < 6; k++)
+                for (k = 0; k < 6; k++)
                     *(cur_kernel_int8++) = 0;
             }
-            if(kernel_size & 0x1)
+            if (kernel_size & 0x1)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[0] + j);
-                for(k = 0; k < 7; k++)
+                for (k = 0; k < 7; k++)
                     *(cur_kernel_int8++) = 0;
             }
         }
@@ -665,18 +664,18 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan
     int kernel_size1 = kernel_size & 0x1;
 
     // interleave 8 kernels
-    for(i = 0; i < (kernel_chan & -8); i += 8)
+    for (i = 0; i < (kernel_chan & -8); i += 8)
     {
-        for(j = 0; j < 8; j++)
+        for (j = 0; j < 8; j++)
             cur_kernel[j] = kernel + kernel_size * (i + j);
-        for(j = 0; j < (kernel_size & -2); j += 2)
-            for(k = 0; k < 8; k++)
+        for (j = 0; j < (kernel_size & -2); j += 2)
+            for (k = 0; k < 8; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
             }
-        if(kernel_size1)
-            for(k = 0; k < 8; k++)
+        if (kernel_size1)
+            for (k = 0; k < 8; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = 0;
@@ -684,87 +683,87 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan
     }
 
     // interleave 4 kernels
-    for(; i < (kernel_chan & -4); i += 4)
+    for (; i < (kernel_chan & -4); i += 4)
     {
-        for(j = 0; j < 4; j++)
+        for (j = 0; j < 4; j++)
             cur_kernel[j] = kernel + kernel_size * (i + j);
-        for(j = 0; j < (kernel_size & -2); j += 2)
-            for(k = 0; k < 4; k++)
+        for (j = 0; j < (kernel_size & -2); j += 2)
+            for (k = 0; k < 4; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
             }
-        if(kernel_size1)
-            for(k = 0; k < 4; k++)
+        if (kernel_size1)
+            for (k = 0; k < 4; k++)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                 *(cur_kernel_int8++) = 0;
             }
     }
     // last 4 kernels
-    if(kernel_chan3)
+    if (kernel_chan3)
     {
-        for(j = 0; j < 3; j++)
+        for (j = 0; j < 3; j++)
             cur_kernel[j] = kernel + kernel_size * (i + j);
-        if((kernel_chan3) == 3)
+        if ((kernel_chan3) == 3)
         {
-            for(j = 0; j < (kernel_size & -2); j += 2)
+            for (j = 0; j < (kernel_size & -2); j += 2)
             {
-                for(k = 0; k < 3; k++)
+                for (k = 0; k < 3; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
                 }
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                     *(cur_kernel_int8++) = 0;
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(k = 0; k < 3; k++)
+                for (k = 0; k < 3; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = 0;
                 }
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                     *(cur_kernel_int8++) = 0;
             }
         }
-        else if((kernel_chan3) == 2)
+        else if ((kernel_chan3) == 2)
         {
-            for(j = 0; j < (kernel_size & -2); j += 2)
+            for (j = 0; j < (kernel_size & -2); j += 2)
             {
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j + 1);
                 }
-                for(k = 0; k < 4; k++)
+                for (k = 0; k < 4; k++)
                     *(cur_kernel_int8++) = 0;
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(k = 0; k < 2; k++)
+                for (k = 0; k < 2; k++)
                 {
                     *(cur_kernel_int8++) = *(cur_kernel[k] + j);
                     *(cur_kernel_int8++) = 0;
                 }
-                for(k = 0; k < 4; k++)
+                for (k = 0; k < 4; k++)
                     *(cur_kernel_int8++) = 0;
             }
         }
         else
-        {    // kernel_chan & 0x3 == 1
-            for(j = 0; j < (kernel_size & -2); j += 2)
+        { // kernel_chan & 0x3 == 1
+            for (j = 0; j < (kernel_size & -2); j += 2)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[0] + j);
                 *(cur_kernel_int8++) = *(cur_kernel[0] + j + 1);
-                for(k = 0; k < 6; k++)
+                for (k = 0; k < 6; k++)
                     *(cur_kernel_int8++) = 0;
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
                 *(cur_kernel_int8++) = *(cur_kernel[0] + j);
-                for(k = 0; k < 7; k++)
+                for (k = 0; k < 7; k++)
                     *(cur_kernel_int8++) = 0;
             }
         }
@@ -776,9 +775,9 @@ void interleave_kernel_int8(int8_t* kernel, int8_t* kernel_int8, int kernel_chan
 /* kernel interleave */
 static void interleave_int8(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param)
 {
-    int group       = param->group;
+    int group = param->group;
     int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3];
-    int out_chan    = filter->dims[0] / group;
+    int out_chan = filter->dims[0] / group;
     int out_chan_align4 = (out_chan + 3) / 4 * 4;
 
     int kernel_size_algin = kernel_size * out_chan_align4;
@@ -788,15 +787,14 @@ static void interleave_int8(struct tensor* filter, struct conv_priv_info* priv_i
     int8_t* interleave_buf = priv_info->interleave_buffer;
     for (int g = 0; g < group; g++)
     {
-        int8_t* cur_kernel     = kernel + g * kernel_size_group;
+        int8_t* cur_kernel = kernel + g * kernel_size_group;
         int8_t* cur_interleave = interleave_buf + g * kernel_size_algin;
         interleave_kernel_int8(cur_kernel, cur_interleave, out_chan, kernel_size);
     }
 }
 
-
 static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, int input_y, int kernel_x, int kernel_y, int stride_x, int stride_y, int dilation_x,
-                   int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, int output_y, int num_thread)
+                        int dilation_y, int pad_x0, int pad_x1, int pad_y0, int pad_y1, int output_x, int output_y, int num_thread)
 {
     int col_start = 0;
     int col_end = output_x * output_y;
@@ -813,21 +811,21 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
 
 #ifdef __aarch64__
     // is 1x1
-    if(is_1x1)
+    if (is_1x1)
     {
         int8_t* cur_col = col + col_start * kernel_size_aligned2;
         int col_cnt = (col_end & -4) - (col_start & -4);
-        im2col_int8_1x1(( int8_t* )im + col_start, input_xy, cur_col, col_cnt, kernel_size);
+        im2col_int8_1x1((int8_t*)im + col_start, input_xy, cur_col, col_cnt, kernel_size);
         cur_col += col_cnt * kernel_size_aligned2;
         int col_i = col_end & -4;
         // final 4 input
-        if(col_end3)
+        if (col_end3)
         {
-            for(int kch = 0; kch < (kernel_size & -2); kch += 2)
+            for (int kch = 0; kch < (kernel_size & -2); kch += 2)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
-                    if((col_i + i) < col_end)
+                    if ((col_i + i) < col_end)
                     {
                         *cur_col++ = *(im + input_xy * (kch + 0) + col_i + i);
                         *cur_col++ = *(im + input_xy * (kch + 1) + col_i + i);
@@ -840,11 +838,11 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                 }
             }
             int kch = kernel_size & -2;
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
-                    if((col_i + i) < col_end)
+                    if ((col_i + i) < col_end)
                     {
                         *cur_col++ = *(im + input_xy * (kch + 0) + col_i + i);
                         *cur_col++ = 0;
@@ -859,10 +857,10 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
     }
     // 3x3 non dilation
-    else if(is_3x3)
+    else if (is_3x3)
     {
 #pragma omp parallel for num_threads(num_thread)
-        for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+        for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             int imx[4] = {0};
             int imy[4] = {0};
@@ -872,17 +870,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             int imy_start[4] = {0};
             int8_t* cur_col = col + col_i * kernel_size_aligned2;
 
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
                 imx_start[i] = cnt_x[i] * stride_x - pad_x0;
                 imy_start[i] = cnt_y[i] * stride_y - pad_y0;
             }
-            if((cnt_y[0] == cnt_y[3]) &&
-               (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1))))
+            if ((cnt_y[0] == cnt_y[3]) && (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1))))
             {
-                int8_t* input_start = ( int8_t* )(im + imy_start[0] * input_x + imx_start[0]);
+                int8_t* input_start = (int8_t*)(im + imy_start[0] * input_x + imx_start[0]);
                 im2col_int8_3x3(input_start, input_x, input_y, input_chan, cur_col, stride_x);
                 cur_col += 4 * kernel_size_aligned2;
             }
@@ -891,32 +888,32 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                 bool odd_line = false;
                 int kchp = 0;
                 int kyp = 0;
-                for(int kch = 0; kch < input_chan; kch++)
+                for (int kch = 0; kch < input_chan; kch++)
                 {
-                    for(int ky = 0; ky < 3; ky++)
+                    for (int ky = 0; ky < 3; ky++)
                     {
-                        if(odd_line)
+                        if (odd_line)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
                                 imy[i] = imy_start[i] + kyp;
                                 imx[i] = imx_start[i] + 2;
-                                if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                                 else
                                     *cur_col++ = 0;
                                 imy[i] = imy_start[i] + ky;
-                                if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                                if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                                 else
                                     *cur_col++ = 0;
                             }
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + 1 + k;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -927,14 +924,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                         // even line  2n
                         else
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                                 imy[i] = imy_start[i] + ky;
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + k;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -946,13 +943,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                         }
                     }
                 }
-                if(kernel_size1)
+                if (kernel_size1)
                 {
-                    for(int i = 0; i < 4; i++)
+                    for (int i = 0; i < 4; i++)
                     {
                         imy[i] = imy_start[i] + kyp;
                         imx[i] = imx_start[i] + 2;
-                        if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                        if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                             *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                         else
                             *cur_col++ = 0;
@@ -962,7 +959,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             }
         }
         int col_i = col_end & -4;
-        if(col_end3)
+        if (col_end3)
         {
             int imx[4] = {0};
             int imy[4] = {0};
@@ -971,7 +968,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             int imx_start[4] = {0};
             int imy_start[4] = {0};
             int8_t* cur_col = col + col_i * kernel_size_aligned2;
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
@@ -981,33 +978,33 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             bool odd_line = false;
             int kchp = 0;
             int kyp = 0;
-            for(int kch = 0; kch < input_chan; kch++)
+            for (int kch = 0; kch < input_chan; kch++)
             {
-                for(int ky = 0; ky < 3; ky++)
+                for (int ky = 0; ky < 3; ky++)
                 {
                     // odd line 1 + 2n
-                    if(odd_line)
+                    if (odd_line)
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
                             imy[i] = imy_start[i] + kyp;
                             imx[i] = imx_start[i] + 2;
-                            if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0;
                             imy[i] = imy_start[i] + ky;
-                            if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                             else
                                 *cur_col++ = 0;
                         }
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
-                            for(int k = 0; k < 2; k++)
+                            for (int k = 0; k < 2; k++)
                             {
                                 imx[i] = imx_start[i] + (1 + k);
-                                if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                 else
                                     *cur_col++ = 0;
@@ -1018,14 +1015,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     // even line  2n + 1
                     else
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                             imy[i] = imy_start[i] + ky;
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
-                            for(int k = 0; k < 2; k++)
+                            for (int k = 0; k < 2; k++)
                             {
                                 imx[i] = imx_start[i] + k;
-                                if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                 else
                                     *cur_col++ = 0;
@@ -1037,13 +1034,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     }
                 }
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imy[i] = imy_start[i] + kyp;
                     imx[i] = imx_start[i] + 2;
-                    if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                    if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                         *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                     else
                         *cur_col++ = 0;
@@ -1053,43 +1050,43 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
     }
     // general case for kernel size <=3
-    else if((kernel_x) < 4 && (kernel_y < 4))
+    else if ((kernel_x) < 4 && (kernel_y < 4))
     {
         int kch[2], kx[2], ky[2], imx[4][2], imy[4][2];
         int8_t* cur_col = col + col_start * kernel_size_aligned2;
-        for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+        for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
                 imx_start[i] = cnt_x[i] * stride_x - pad_x0;
                 imy_start[i] = cnt_y[i] * stride_y - pad_y0;
             }
-            for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
+            for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
             {
-                for(int k = 0; k < 2; k++)
+                for (int k = 0; k < 2; k++)
                 {
                     kch[k] = (col_j + k) / kernel_xy;
                     ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x;
                     kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x;
                     ky[k] = ky[k] * dilation_y;
                     kx[k] = kx[k] * dilation_x;
-                    for(int i = 0; i < 4; i++)
+                    for (int i = 0; i < 4; i++)
                     {
                         imx[i][k] = imx_start[i] + kx[k];
                         imy[i][k] = imy_start[i] + ky[k];
                     }
                 }
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
-                    for(int k = 0; k < 2; k++)
+                    for (int k = 0; k < 2; k++)
                     {
-                        if(imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y)
+                        if (imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y)
                             *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]);
                         else
                             *cur_col++ = 0;
@@ -1097,18 +1094,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                 }
             }
             int col_j = kernel_size & -2;
-            if(kernel_size1)
+            if (kernel_size1)
             {
                 kch[0] = col_j / kernel_xy;
                 ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x;
                 kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x;
                 ky[0] = ky[0] * dilation_y;
                 kx[0] = kx[0] * dilation_x;
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imx[i][0] = imx_start[i] + kx[0];
                     imy[i][0] = imy_start[i] + ky[0];
-                    if(imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y)
+                    if (imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y)
                         *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]);
                     else
                         *cur_col++ = 0;
@@ -1118,40 +1115,39 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
         int col_i = col_end & -4;
         // final 4 input
-        if(col_end3)
+        if (col_end3)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
                 imx_start[i] = cnt_x[i] * stride_x - pad_x0;
                 imy_start[i] = cnt_y[i] * stride_y - pad_y0;
             }
-            for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
+            for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
             {
-                for(int k = 0; k < 2; k++)
+                for (int k = 0; k < 2; k++)
                 {
                     kch[k] = (col_j + k) / kernel_xy;
                     ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x;
                     kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x;
                     ky[k] = ky[k] * dilation_y;
                     kx[k] = kx[k] * dilation_x;
-                    for(int i = 0; i < 4; i++)
+                    for (int i = 0; i < 4; i++)
                     {
                         imx[i][k] = imx_start[i] + kx[k];
                         imy[i][k] = imy_start[i] + ky[k];
                     }
                 }
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
-                    for(int k = 0; k < 2; k++)
+                    for (int k = 0; k < 2; k++)
                     {
-                        if((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 &&
-                           imy[i][k] < input_y)
+                        if ((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y)
                             *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]);
                         else
                             *cur_col++ = 0;
@@ -1159,18 +1155,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                 }
             }
             int col_j = kernel_size & -2;
-            if(kernel_size1)
+            if (kernel_size1)
             {
                 kch[0] = col_j / kernel_xy;
                 ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x;
                 kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x;
                 ky[0] = ky[0] * dilation_y;
                 kx[0] = kx[0] * dilation_x;
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imx[i][0] = imx_start[i] + kx[0];
                     imy[i][0] = imy_start[i] + ky[0];
-                    if((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y)
+                    if ((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y)
                         *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]);
                     else
                         *cur_col++ = 0;
@@ -1185,13 +1181,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         int kch, kx, ky, kchp, kyp, imx[4], imy[4] = {0};
         int kernel_x1 = kernel_x & 0x1;
         int8_t* cur_col = col + col_start * kernel_size_aligned2;
-        for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+        for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
@@ -1201,35 +1197,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             bool odd_line = false;
             kchp = 0;
             kyp = 0;
-            for(int kch = 0; kch < input_chan; kch++)
+            for (int kch = 0; kch < input_chan; kch++)
             {
-                for(ky = 0; ky < kernel_y; ky++)
+                for (ky = 0; ky < kernel_y; ky++)
                 {
                     // odd line 2 + 2n
-                    if(odd_line)
+                    if (odd_line)
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
                             imy[i] = imy_start[i] + kyp * dilation_y;
                             imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                            if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                            if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0;
                             imy[i] = imy_start[i] + ky * dilation_y;
-                            if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                            if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                             else
                                 *cur_col++ = 0;
                         }
-                        for(kx = 1; kx < kernel_x; kx += 2)
+                        for (kx = 1; kx < kernel_x; kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1241,16 +1237,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     // even line  2n
                     else
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                             imy[i] = imy_start[i] + ky * dilation_y;
-                        for(kx = 0; kx < (kernel_x - 1); kx += 2)
+                        for (kx = 0; kx < (kernel_x - 1); kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1263,13 +1259,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     }
                 }
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imy[i] = imy_start[i] + kyp * dilation_y;
                     imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                    if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                    if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                         *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                     else
                         *cur_col++ = 0;
@@ -1279,13 +1275,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
         int col_i = col_end & -4;
         // final 4 input
-        if(col_end3)
+        if (col_end3)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
@@ -1295,36 +1291,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             bool odd_line = false;
             kchp = 0;
             kyp = 0;
-            for(int kch = 0; kch < input_chan; kch++)
+            for (int kch = 0; kch < input_chan; kch++)
             {
-                for(ky = 0; ky < kernel_y; ky++)
+                for (ky = 0; ky < kernel_y; ky++)
                 {
                     // odd line 1 + 2n
-                    if(odd_line)
+                    if (odd_line)
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
                             imy[i] = imy_start[i] + kyp * dilation_y;
                             imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                            if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0;
                             imy[i] = imy_start[i] + ky * dilation_y;
-                            if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                             else
                                 *cur_col++ = 0;
                         }
-                        for(kx = 1; kx < kernel_x; kx += 2)
+                        for (kx = 1; kx < kernel_x; kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 &&
-                                       imy[i] < input_y)
+                                    if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1336,17 +1331,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     // even line  2n + 1
                     else
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                             imy[i] = imy_start[i] + ky * dilation_y;
-                        for(kx = 0; kx < (kernel_x - 1); kx += 2)
+                        for (kx = 0; kx < (kernel_x - 1); kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 &&
-                                       imy[i] < input_y)
+                                    if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1359,13 +1353,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     }
                 }
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imy[i] = imy_start[i] + kyp * dilation_y;
                     imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                    if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                    if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                         *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                     else
                         *cur_col++ = 0;
@@ -1375,12 +1369,12 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
     }
 #else
-    if(is_3x3)
+    if (is_3x3)
     {
         int stride_x2 = stride_x * 2;
         int stride_x3 = stride_x * 3;
-// #pragma omp parallel for num_threads(num_thread)
-        for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+        // #pragma omp parallel for num_threads(num_thread)
+        for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             int imx[4] = {0};
             int imy[4] = {0};
@@ -1389,23 +1383,22 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             int imx_start[4] = {0};
             int imy_start[4] = {0};
             int8_t* cur_col = col + col_i * kernel_size_aligned2;
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
                 imx_start[i] = cnt_x[i] * stride_x - pad_x0;
                 imy_start[i] = cnt_y[i] * stride_y - pad_y0;
             }
-            if((cnt_y[0] == cnt_y[3]) &&
-               (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1))))
+            if ((cnt_y[0] == cnt_y[3]) && (is_pad0 || (cnt_y[0] > 0 && cnt_x[0] > 0 && cnt_y[0] < (output_y - 1) && cnt_x[3] < (output_x - 1))))
             {
-                int8_t* l00 = ( int8_t* )(im + imy_start[0] * input_x + imx_start[0]);
+                int8_t* l00 = (int8_t*)(im + imy_start[0] * input_x + imx_start[0]);
                 int8_t* l01 = l00 + input_x;
                 int8_t* l02 = l00 + input_x * 2;
                 int8_t* l10 = l00 + input_xy;
                 int8_t* l11 = l10 + input_x;
                 int8_t* l12 = l10 + input_x * 2;
-                for(int kch = 0; kch < (input_chan & -2); kch += 2)
+                for (int kch = 0; kch < (input_chan & -2); kch += 2)
                 {
                     cur_col[0] = l00[0];
                     cur_col[1] = l00[1];
@@ -1487,7 +1480,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     l11 += input_xy * 2;
                     l12 += input_xy * 2;
                 }
-                if(input_chan & 0x1)
+                if (input_chan & 0x1)
                 {
                     cur_col[0] = l00[0];
                     cur_col[1] = l00[1];
@@ -1536,32 +1529,32 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                 bool odd_line = false;
                 int kchp = 0;
                 int kyp = 0;
-                for(int kch = 0; kch < input_chan; kch++)
+                for (int kch = 0; kch < input_chan; kch++)
                 {
-                    for(int ky = 0; ky < 3; ky++)
+                    for (int ky = 0; ky < 3; ky++)
                     {
-                        if(odd_line)
+                        if (odd_line)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
                                 imy[i] = imy_start[i] + kyp;
                                 imx[i] = imx_start[i] + 2;
-                                if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                                 else
                                     *cur_col++ = 0;
                                 imy[i] = imy_start[i] + ky;
-                                if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                                if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                                 else
                                     *cur_col++ = 0;
                             }
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + 1 + k;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1572,14 +1565,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                         // even line  2n
                         else
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                                 imy[i] = imy_start[i] + ky;
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + k;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1591,13 +1584,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                         }
                     }
                 }
-                if(kernel_size1)
+                if (kernel_size1)
                 {
-                    for(int i = 0; i < 4; i++)
+                    for (int i = 0; i < 4; i++)
                     {
                         imy[i] = imy_start[i] + kyp;
                         imx[i] = imx_start[i] + 2;
-                        if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                        if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                             *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                         else
                             *cur_col++ = 0;
@@ -1607,7 +1600,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             }
         }
         int col_i = col_end & -4;
-        if(col_end3)
+        if (col_end3)
         {
             int imx[4] = {0};
             int imy[4] = {0};
@@ -1616,7 +1609,7 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             int imx_start[4] = {0};
             int imy_start[4] = {0};
             int8_t* cur_col = col + col_i * kernel_size_aligned2;
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
@@ -1626,33 +1619,33 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             bool odd_line = false;
             int kchp = 0;
             int kyp = 0;
-            for(int kch = 0; kch < input_chan; kch++)
+            for (int kch = 0; kch < input_chan; kch++)
             {
-                for(int ky = 0; ky < 3; ky++)
+                for (int ky = 0; ky < 3; ky++)
                 {
                     // odd line 1 + 2n
-                    if(odd_line)
+                    if (odd_line)
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
                             imy[i] = imy_start[i] + kyp;
                             imx[i] = imx_start[i] + 2;
-                            if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0;
                             imy[i] = imy_start[i] + ky;
-                            if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                             else
                                 *cur_col++ = 0;
                         }
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
-                            for(int k = 0; k < 2; k++)
+                            for (int k = 0; k < 2; k++)
                             {
                                 imx[i] = imx_start[i] + (1 + k);
-                                if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                 else
                                     *cur_col++ = 0;
@@ -1663,14 +1656,14 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     // even line  2n + 1
                     else
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                             imy[i] = imy_start[i] + ky;
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
-                            for(int k = 0; k < 2; k++)
+                            for (int k = 0; k < 2; k++)
                             {
                                 imx[i] = imx_start[i] + k;
-                                if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                     *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                 else
                                     *cur_col++ = 0;
@@ -1682,13 +1675,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     }
                 }
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imy[i] = imy_start[i] + kyp;
                     imx[i] = imx_start[i] + 2;
-                    if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                    if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                         *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                     else
                         *cur_col++ = 0;
@@ -1698,43 +1691,43 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
     }
     // general case for kernel size <=3
-    else if((kernel_x) < 4 && (kernel_y < 4))
+    else if ((kernel_x) < 4 && (kernel_y < 4))
     {
         int kch[2], kx[2], ky[2], imx[4][2], imy[4][2];
-        for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+        for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
             int8_t* cur_col = col + col_i * kernel_size_aligned2;
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
                 imx_start[i] = cnt_x[i] * stride_x - pad_x0;
                 imy_start[i] = cnt_y[i] * stride_y - pad_y0;
             }
-            for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
+            for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
             {
-                for(int k = 0; k < 2; k++)
+                for (int k = 0; k < 2; k++)
                 {
                     kch[k] = (col_j + k) / kernel_xy;
                     ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x;
                     kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x;
                     ky[k] = ky[k] * dilation_y;
                     kx[k] = kx[k] * dilation_x;
-                    for(int i = 0; i < 4; i++)
+                    for (int i = 0; i < 4; i++)
                     {
                         imx[i][k] = imx_start[i] + kx[k];
                         imy[i][k] = imy_start[i] + ky[k];
                     }
                 }
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
-                    for(int k = 0; k < 2; k++)
+                    for (int k = 0; k < 2; k++)
                     {
-                        if(imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y)
+                        if (imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y)
                             *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]);
                         else
                             *cur_col++ = 0;
@@ -1742,18 +1735,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                 }
             }
             int col_j = kernel_size & -2;
-            if(kernel_size1)
+            if (kernel_size1)
             {
                 kch[0] = col_j / kernel_xy;
                 ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x;
                 kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x;
                 ky[0] = ky[0] * dilation_y;
                 kx[0] = kx[0] * dilation_x;
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imx[i][0] = imx_start[i] + kx[0];
                     imy[i][0] = imy_start[i] + ky[0];
-                    if(imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y)
+                    if (imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y)
                         *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]);
                     else
                         *cur_col++ = 0;
@@ -1763,41 +1756,40 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
         int col_i = col_end & -4;
         // final 4 input
-        if(col_end3)
+        if (col_end3)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
             int8_t* cur_col = col + col_i * kernel_size_aligned2;
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
                 imx_start[i] = cnt_x[i] * stride_x - pad_x0;
                 imy_start[i] = cnt_y[i] * stride_y - pad_y0;
             }
-            for(int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
+            for (int col_j = 0; col_j < (kernel_size & -2); col_j += 2)
             {
-                for(int k = 0; k < 2; k++)
+                for (int k = 0; k < 2; k++)
                 {
                     kch[k] = (col_j + k) / kernel_xy;
                     ky[k] = (col_j + k - kch[k] * kernel_xy) / kernel_x;
                     kx[k] = (col_j + k - kch[k] * kernel_xy) - ky[k] * kernel_x;
                     ky[k] = ky[k] * dilation_y;
                     kx[k] = kx[k] * dilation_x;
-                    for(int i = 0; i < 4; i++)
+                    for (int i = 0; i < 4; i++)
                     {
                         imx[i][k] = imx_start[i] + kx[k];
                         imy[i][k] = imy_start[i] + ky[k];
                     }
                 }
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
-                    for(int k = 0; k < 2; k++)
+                    for (int k = 0; k < 2; k++)
                     {
-                        if((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 &&
-                           imy[i][k] < input_y)
+                        if ((col_i + i) < col_end && imx[i][k] >= 0 && imx[i][k] < input_x && imy[i][k] >= 0 && imy[i][k] < input_y)
                             *cur_col++ = *(im + input_xy * kch[k] + input_x * imy[i][k] + imx[i][k]);
                         else
                             *cur_col++ = 0;
@@ -1805,19 +1797,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                 }
             }
             int col_j = kernel_size & -2;
-            if(kernel_size1)
+            if (kernel_size1)
             {
                 kch[0] = col_j / kernel_xy;
                 ky[0] = (col_j - kch[0] * kernel_xy) / kernel_x;
                 kx[0] = col_j - kch[0] * kernel_xy - ky[0] * kernel_x;
                 ky[0] = ky[0] * dilation_y;
                 kx[0] = kx[0] * dilation_x;
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imx[i][0] = imx_start[i] + kx[0];
                     imy[i][0] = imy_start[i] + ky[0];
-                    if((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 &&
-                       imy[i][0] < input_y)
+                    if ((col_i + i) < col_end && imx[i][0] >= 0 && imx[i][0] < input_x && imy[i][0] >= 0 && imy[i][0] < input_y)
                         *cur_col++ = *(im + input_xy * kch[0] + input_x * imy[i][0] + imx[i][0]);
                     else
                         *cur_col++ = 0;
@@ -1832,13 +1823,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         int kch, kx, ky, kchp, kyp, imx[4], imy[4];
         int kernel_x1 = kernel_x & 0x1;
         int8_t* cur_col = col + col_start * kernel_size_aligned2;
-        for(int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
+        for (int col_i = (col_start & -4); col_i < (col_end & -4); col_i += 4)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
@@ -1848,35 +1839,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             bool odd_line = false;
             kchp = 0;
             kyp = 0;
-            for(int kch = 0; kch < input_chan; kch++)
+            for (int kch = 0; kch < input_chan; kch++)
             {
-                for(int ky = 0; ky < kernel_y; ky++)
+                for (int ky = 0; ky < kernel_y; ky++)
                 {
                     // odd line 2 + 2n
-                    if(odd_line)
+                    if (odd_line)
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
                             imy[i] = imy_start[i] + kyp * dilation_y;
                             imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                            if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                            if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0;
                             imy[i] = imy_start[i] + ky * dilation_y;
-                            if(imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                            if (imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                             else
                                 *cur_col++ = 0;
                         }
-                        for(int kx = 1; kx < kernel_x; kx += 2)
+                        for (int kx = 1; kx < kernel_x; kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1888,16 +1879,16 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     // even line  2n
                     else
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                             imy[i] = imy_start[i] + ky * dilation_y;
-                        for(int kx = 0; kx < (kernel_x - 1); kx += 2)
+                        for (int kx = 0; kx < (kernel_x - 1); kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if(imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                                    if (imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1910,13 +1901,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     }
                 }
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imy[i] = imy_start[i] + kyp * dilation_y;
                     imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                    if(imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                    if (imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                         *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                     else
                         *cur_col++ = 0;
@@ -1926,13 +1917,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
         }
         int col_i = col_end & -4;
         // final 4 input
-        if(col_end3)
+        if (col_end3)
         {
             int cnt_x[4] = {0};
             int cnt_y[4] = {0};
             int imx_start[4] = {0};
             int imy_start[4] = {0};
-            for(int i = 0; i < 4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 cnt_y[i] = (col_i + i) / output_x;
                 cnt_x[i] = col_i + i - cnt_y[i] * output_x;
@@ -1942,36 +1933,35 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
             bool odd_line = false;
             kchp = 0;
             kyp = 0;
-            for(int kch = 0; kch < input_chan; kch++)
+            for (int kch = 0; kch < input_chan; kch++)
             {
-                for(int ky = 0; ky < kernel_y; ky++)
+                for (int ky = 0; ky < kernel_y; ky++)
                 {
                     // odd line 1 + 2n
-                    if(odd_line)
+                    if (odd_line)
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
                             imy[i] = imy_start[i] + kyp * dilation_y;
                             imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                            if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                             else
                                 *cur_col++ = 0;
                             imy[i] = imy_start[i] + ky * dilation_y;
-                            if((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
+                            if ((i < col_end3) && imx_start[i] >= 0 && imy[i] >= 0 && imy[i] < input_y)
                                 *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx_start[i]);
                             else
                                 *cur_col++ = 0;
                         }
-                        for(int kx = 1; kx < kernel_x; kx += 2)
+                        for (int kx = 1; kx < kernel_x; kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 &&
-                                       imy[i] < input_y)
+                                    if ((i < col_end3) && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -1983,19 +1973,18 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     // even line  2n + 1
                     else
                     {
-                        for(int i = 0; i < 4; i++)
+                        for (int i = 0; i < 4; i++)
                         {
                             imy[i] = imy_start[i] + ky * dilation_y;
                         }
-                        for(int kx = 0; kx < (kernel_x - 1); kx += 2)
+                        for (int kx = 0; kx < (kernel_x - 1); kx += 2)
                         {
-                            for(int i = 0; i < 4; i++)
+                            for (int i = 0; i < 4; i++)
                             {
-                                for(int k = 0; k < 2; k++)
+                                for (int k = 0; k < 2; k++)
                                 {
                                     imx[i] = imx_start[i] + (kx + k) * dilation_x;
-                                    if(i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 &&
-                                       imy[i] < input_y)
+                                    if (i < col_end3 && imx[i] >= 0 && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                                         *cur_col++ = *(im + input_xy * kch + input_x * imy[i] + imx[i]);
                                     else
                                         *cur_col++ = 0;
@@ -2008,13 +1997,13 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
                     }
                 }
             }
-            if(kernel_size1)
+            if (kernel_size1)
             {
-                for(int i = 0; i < 4; i++)
+                for (int i = 0; i < 4; i++)
                 {
                     imy[i] = imy_start[i] + kyp * dilation_y;
                     imx[i] = imx_start[i] + (kernel_x - 1) * dilation_x;
-                    if((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
+                    if ((i < col_end3) && imx[i] < input_x && imy[i] >= 0 && imy[i] < input_y)
                         *cur_col++ = *(im + input_xy * kchp + input_x * imy[i] + imx[i]);
                     else
                         *cur_col++ = 0;
@@ -2027,9 +2016,8 @@ static void im2col_int8(int8_t* im, int8_t* col, int input_chan, int input_x, in
     return;
 }
 
-
 int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
-                    struct conv_priv_info* priv_info, struct conv_param* param)
+                         struct conv_priv_info* priv_info, struct conv_param* param)
 {
     int in_c = input_tensor->dims[1];
     int in_h = input_tensor->dims[2];
@@ -2043,7 +2031,7 @@ int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
     {
         int mem_size = int8_conv_hcl_get_shared_mem_size(input_tensor, output_tensor, param);
         void* mem = sys_malloc(mem_size);
-        priv_info->im2col_buffer      = mem;
+        priv_info->im2col_buffer = mem;
         priv_info->im2col_buffer_size = mem_size;
     }
     /* alloc mem of kernel interleave */
@@ -2051,7 +2039,7 @@ int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
     {
         int mem_size = get_private_mem_size(filter_tensor, param);
         void* mem = sys_malloc(mem_size);
-        priv_info->interleave_buffer      = mem;
+        priv_info->interleave_buffer = mem;
         priv_info->interleave_buffer_size = mem_size;
     }
     /* kernel interleave */
@@ -2067,19 +2055,19 @@ int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
     priv_info->activation_min = -127;
     priv_info->activation_max = 127;
     /*  set activation   */
-    if(param->activation >= 0)
+    if (param->activation >= 0)
     {
         priv_info->activation_min = 0;
-        if(param->activation == 1)
+        if (param->activation == 1)
             priv_info->activation_max = round(1.0 / output_scale);
-        if(param->activation == 6)
+        if (param->activation == 6)
             priv_info->activation_max = round(6.0 / output_scale);
 
-        if(priv_info->activation_max > 127)
+        if (priv_info->activation_max > 127)
             priv_info->activation_max = 127;
     }
 
-    for(int i=0; i<out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
         float kernel_scale = kernel_scales[i];
         float scale = input_scale * kernel_scale / output_scale;
@@ -2088,7 +2076,7 @@ int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
         float q = frexp(scale, &shift);
         int fix_q = round(q * (1ll << 31));
         // TLOG_ERR("prerun: %f,%lld,%d,%d, %lld\n",q, fix_q, multi, q_shift, 1ll<<31);
-        if(fix_q == (1l << 31))
+        if (fix_q == (1l << 31))
         {
             fix_q /= 2;
             shift++;
@@ -2128,8 +2116,8 @@ int int8_conv_hcl_postrun(struct conv_priv_info* priv_info)
 }
 
 int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                 struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                 int num_thread, int cpu_affinity)
+                      struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
+                      int num_thread, int cpu_affinity)
 {
     /* param */
     int group = param->group;
@@ -2165,8 +2153,8 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
     int activation_max = priv_info->activation_max;
 
     /* buffer addr */
-    int8_t* input_buf = ( int8_t* )input_tensor->data;
-    int8_t* output_buf = ( int8_t* )output_tensor->data;
+    int8_t* input_buf = (int8_t*)input_tensor->data;
+    int8_t* output_buf = (int8_t*)output_tensor->data;
     int32_t* biases_buf = NULL;
     bool have_biases = false;
     if (bias_tensor != NULL)
@@ -2175,11 +2163,11 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
         have_biases = true;
     }
 
-    int8_t* col_buf = ( int8_t* )priv_info->im2col_buffer;
-    int8_t* interleave_buf = ( int8_t* )priv_info->interleave_buffer;
+    int8_t* col_buf = (int8_t*)priv_info->im2col_buffer;
+    int8_t* interleave_buf = (int8_t*)priv_info->interleave_buffer;
 
     /* block size split parameter */
-    int L2_CACHE_SIZE = (cpu_affinity == TENGINE_CLUSTER_LITTLE)? 512 * 1024 : 1024 * 1024;
+    int L2_CACHE_SIZE = (cpu_affinity == TENGINE_CLUSTER_LITTLE) ? 512 * 1024 : 1024 * 1024;
     int kernel_size_l1 = kernel_size;
 #ifdef __aarch64__
     int col_cnt_l2 = L2_CACHE_SIZE * 3 / kernel_size_l1 / 4;
@@ -2188,7 +2176,7 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
 #endif
     col_cnt_l2 = col_cnt_l2 > 4 ? (col_cnt_l2 & -4) : 4;
 
-    for (int n = 0; n < batch; n++)    // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
         int8_t* input = input_buf + n * input_size * group;
         int8_t* output = output_buf + n * output_size * group;
@@ -2197,7 +2185,7 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
             int8_t* cur_input = input + g * input_size;
 
             im2col_int8(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h,
-                   pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread);
+                        pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread);
 
             int kernel_size_aligned2 = (kernel_size + 1) & -2;
             int output_chan_aligned4 = (out_c + 3) & -4;
@@ -2209,25 +2197,25 @@ int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
             int* q_shift_g = priv_info->q_shift + g * out_c;
 
             // for input block of L2 cache size
-            for(int col_i = 0; col_i < out_hw; col_i += col_cnt_l2)
+            for (int col_i = 0; col_i < out_hw; col_i += col_cnt_l2)
             {
                 int col_start = col_i;
                 int col_end = col_i + col_cnt_l2;
                 col_end = col_end > out_hw ? out_hw : col_end;
 #ifdef __aarch64__
                 i8gemm4x16(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw,
-                            col_start, col_end, 0, out_c & -16, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
-                if(out_c & 0xf)
+                           col_start, col_end, 0, out_c & -16, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
+                if (out_c & 0xf)
                     i8gemm4x4(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw,
-                                col_start, col_end, out_c & -16, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
+                              col_start, col_end, out_c & -16, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
 #else
                 i8gemm4x8(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw,
-                            col_start, col_end, 0, out_c & -8, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
-                if(out_c & 0x7)
+                          col_start, col_end, 0, out_c & -8, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
+                if (out_c & 0x7)
                     i8gemm4x4(col_buf, kernel_g, have_biases, bias_g, output_g, multi_g, kernel_size, out_hw,
-                                col_start, col_end, out_c & -8, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
+                              col_start, col_end, out_c & -8, out_c, activation_min, activation_max, q_shift_g, num_thread, cpu_affinity);
 #endif
-            }    // col_cont
+            } // col_cont
         }
     }
     return 0;
diff --git a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h
index cb19229be..f9603a273 100644
--- a/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/conv_kernel_int8_arm.h
@@ -30,14 +30,13 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int int8_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
-                    struct conv_priv_info* priv_info, struct conv_param* param);
+                         struct conv_priv_info* priv_info, struct conv_param* param);
 
 int int8_conv_hcl_postrun(struct conv_priv_info* priv_info);
 
 int int8_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                 struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                 int num_thread, int cpu_affinity);
+                      struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
+                      int num_thread, int cpu_affinity);
 
 #endif
diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c
index 6c714c0aa..4d26ead44 100644
--- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.c
@@ -38,16 +38,15 @@
 
 #include <arm_neon.h>
 
-
-#define TILE 4
+#define TILE          4
 #define BLOCK_HW_UNIT 4
-#define ELEM_SIZE ((TILE + 2) * (TILE + 2))
+#define ELEM_SIZE     ((TILE + 2) * (TILE + 2))
 
 #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define WINO_MIN(a, b) ((a) < (b) ? (a) : (b))
 
-#define PER_OUT_CHAN 16
-#define KER_COUT_UNIT 16
+#define PER_OUT_CHAN   16
+#define KER_COUT_UNIT  16
 #define KER_COUT_UNIT4 4
 void tran_inp_4(float*, float*, float*, int, int, int);
 void wino_sgemm_4x16_A72(float* output, const float* input, const float* kernel, long cin, short stride_save);
@@ -56,13 +55,17 @@ void wino_sgemm_1x16(float* output, const float* input, const float* kernel, lon
 void wino_sgemm_1x4(float* output, const float* input, const float* kernel, long cin);
 void tran_out_4(float*, float*, int, float*, float*, int);
 
-#define INTERLEAVE_KERNEL_UNIT(cout_idx_p,cout_unit,cin,ker_src,ker_dst,ELEM_SIZE,i,j,s){          \
-    for(i = 0; i < cin; i++){                                                                      \
-        for(j = 0; j < cout_unit; j++){                                                            \
-            *ker_dst = ker_src[((cout_idx_p + j) * cin + i) * ELEM_SIZE + s];                      \
-             ker_dst++;                                                                            \
-        }                                                                                          \
-    }}
+#define INTERLEAVE_KERNEL_UNIT(cout_idx_p, cout_unit, cin, ker_src, ker_dst, ELEM_SIZE, i, j, s) \
+    {                                                                                            \
+        for (i = 0; i < cin; i++)                                                                \
+        {                                                                                        \
+            for (j = 0; j < cout_unit; j++)                                                      \
+            {                                                                                    \
+                *ker_dst = ker_src[((cout_idx_p + j) * cin + i) * ELEM_SIZE + s];                \
+                ker_dst++;                                                                       \
+            }                                                                                    \
+        }                                                                                        \
+    }
 
 static inline void trans_kernel_f43(float* ker, float* trans_ker)
 {
@@ -83,10 +86,10 @@ static inline void trans_kernel_f43(float* ker, float* trans_ker)
     */
     float tmp[18] = {0};
 
-    float neg_r0_add_r2_x_1_6[6];    // (r0+r2)*1./6
-    float r0_1_4_add_r2_x_1_6[6];    // (r0*1/4 + r2)*1./6
-    float r1_1_6[6];    // r1*1/6
-    float r1_1_12[6];    // r1*1/12
+    float neg_r0_add_r2_x_1_6[6]; // (r0+r2)*1./6
+    float r0_1_4_add_r2_x_1_6[6]; // (r0*1/4 + r2)*1./6
+    float r1_1_6[6];              // r1*1/6
+    float r1_1_12[6];             // r1*1/12
     float s_1_6 = 1. / 6.f;
     for (int j = 0; j < 3; j++)
     {
@@ -132,14 +135,14 @@ static inline void transform_kernel_f43_tile(struct tensor* filter, float* trans
 {
     int outc = filter->dims[0];
     int inc = filter->dims[1];
-    float* kernel = ( float* )filter->data;
+    float* kernel = (float*)filter->data;
     float* ker_ptr = trans_ker;
 
     for (int i = 0; i < outc; i++)
     {
         for (int j = 0; j < inc; j++)
         {
-            trans_kernel_f43(( float* )(kernel + 9 * (j + i * inc)), ker_ptr);
+            trans_kernel_f43((float*)(kernel + 9 * (j + i * inc)), ker_ptr);
             ker_ptr += ELEM_SIZE;
         }
     }
@@ -149,22 +152,25 @@ static inline void transform_kernel_f43_tile(struct tensor* filter, float* trans
 // ker1 [ELEM_SIZE][cout//KER_COUT_UNIT][cin][KER_COUT_UNIT]
 static inline void interleave_kernel_1(float* ker0, float* ker1, int cout, int cin)
 {
-    int i,j;
+    int i, j;
     float* ker1_ptr = ker1;
-    for(int s = 0; s < ELEM_SIZE; s++)
+    for (int s = 0; s < ELEM_SIZE; s++)
     {
         int p;
         //cout 16
-        for(p = 0; p < (cout& -KER_COUT_UNIT); p+=KER_COUT_UNIT){
-            INTERLEAVE_KERNEL_UNIT(p,KER_COUT_UNIT,cin,ker0,ker1_ptr,ELEM_SIZE,i,j,s);
+        for (p = 0; p < (cout & -KER_COUT_UNIT); p += KER_COUT_UNIT)
+        {
+            INTERLEAVE_KERNEL_UNIT(p, KER_COUT_UNIT, cin, ker0, ker1_ptr, ELEM_SIZE, i, j, s);
         }
         //cout 4
-        for(p = (cout & -KER_COUT_UNIT); p < (cout & -KER_COUT_UNIT4); p += KER_COUT_UNIT4){
-            INTERLEAVE_KERNEL_UNIT(p,KER_COUT_UNIT4,cin,ker0,ker1_ptr,ELEM_SIZE,i,j,s);
+        for (p = (cout & -KER_COUT_UNIT); p < (cout & -KER_COUT_UNIT4); p += KER_COUT_UNIT4)
+        {
+            INTERLEAVE_KERNEL_UNIT(p, KER_COUT_UNIT4, cin, ker0, ker1_ptr, ELEM_SIZE, i, j, s);
         }
         // cout 1
-        for(p=(cout & -KER_COUT_UNIT4); p < cout; p ++){
-            INTERLEAVE_KERNEL_UNIT(p,1,cin,ker0,ker1_ptr,ELEM_SIZE,i,j,s);
+        for (p = (cout & -KER_COUT_UNIT4); p < cout; p++)
+        {
+            INTERLEAVE_KERNEL_UNIT(p, 1, cin, ker0, ker1_ptr, ELEM_SIZE, i, j, s);
         }
     }
 }
@@ -175,7 +181,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in
     int padded_hw = padded_h * padded_w;
 
     float* pad_ptr;
-    float* inp_ptr = ( float* )input;
+    float* inp_ptr = (float*)input;
     int resi_h = padded_h - pad0 - inh;
     int resi_w = padded_w - pad1 - inw;
     for (int c = 0; c < inc; c++)
@@ -204,7 +210,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in
 
 static inline void trans_inp_1tile(float* input, float* inp_ptr, int ih, int jw, int c, int in_hw, int inw)
 {
-    float* inp = ( float* )input + c * in_hw + ih * 4 * inw + jw * 4;
+    float* inp = (float*)input + c * in_hw + ih * 4 * inw + jw * 4;
     float* inp0 = inp;
     float* inp1 = inp0 + inw;
     float* inp2 = inp1 + inw;
@@ -346,19 +352,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     float32x4_t line0_4 = vld1q_f32(mid + 16);
     float32x4_t line0_5 = vld1q_f32(mid + 20);
 
-    float32x4_t line1_0 = vsubq_f32(r0, r0_);    // mid[(6 + i) * 4 + k]   [1][0]
-    float32x4_t line1_1 = vsubq_f32(r1, r1_);    // mid[(6 + i) * 4 + k]   [1][1]
-    float32x4_t line1_2 = vsubq_f32(r2, r2_);    // mid[(6 + i) * 4 + k]   [1][2]
-    float32x4_t line1_3 = vsubq_f32(r3, r3_);    // mid[(6 + i) * 4 + k]   [1][3]
-    float32x4_t line1_4 = vsubq_f32(r4, r4_);    // mid[(6 + i) * 4 + k]   [1][4]
-    float32x4_t line1_5 = vsubq_f32(r5, r5_);    // mid[(6 + i) * 4 + k]   [1][5]
+    float32x4_t line1_0 = vsubq_f32(r0, r0_); // mid[(6 + i) * 4 + k]   [1][0]
+    float32x4_t line1_1 = vsubq_f32(r1, r1_); // mid[(6 + i) * 4 + k]   [1][1]
+    float32x4_t line1_2 = vsubq_f32(r2, r2_); // mid[(6 + i) * 4 + k]   [1][2]
+    float32x4_t line1_3 = vsubq_f32(r3, r3_); // mid[(6 + i) * 4 + k]   [1][3]
+    float32x4_t line1_4 = vsubq_f32(r4, r4_); // mid[(6 + i) * 4 + k]   [1][4]
+    float32x4_t line1_5 = vsubq_f32(r5, r5_); // mid[(6 + i) * 4 + k]   [1][5]
 
-    float32x4_t line2_0 = vaddq_f32(r0, r0_);    // mid[(12 + i) * 4 + k]  [2][0]
-    float32x4_t line2_1 = vaddq_f32(r1, r1_);    // mid[(12 + i) * 4 + k]  [2][1]
-    float32x4_t line2_2 = vaddq_f32(r2, r2_);    // mid[(12 + i) * 4 + k]  [2][2]
-    float32x4_t line2_3 = vaddq_f32(r3, r3_);    // mid[(12 + i) * 4 + k]  [2][3]
-    float32x4_t line2_4 = vaddq_f32(r4, r4_);    // mid[(12 + i) * 4 + k]  [2][4]
-    float32x4_t line2_5 = vaddq_f32(r5, r5_);    // mid[(12 + i) * 4 + k]  [2][5]
+    float32x4_t line2_0 = vaddq_f32(r0, r0_); // mid[(12 + i) * 4 + k]  [2][0]
+    float32x4_t line2_1 = vaddq_f32(r1, r1_); // mid[(12 + i) * 4 + k]  [2][1]
+    float32x4_t line2_2 = vaddq_f32(r2, r2_); // mid[(12 + i) * 4 + k]  [2][2]
+    float32x4_t line2_3 = vaddq_f32(r3, r3_); // mid[(12 + i) * 4 + k]  [2][3]
+    float32x4_t line2_4 = vaddq_f32(r4, r4_); // mid[(12 + i) * 4 + k]  [2][4]
+    float32x4_t line2_5 = vaddq_f32(r5, r5_); // mid[(12 + i) * 4 + k]  [2][5]
 
     r0 = vld1q_f32(r4_minus_r2);
     r1 = vld1q_f32(r4_minus_r2 + 4);
@@ -381,19 +387,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     float32x4_t line5_4 = vld1q_f32(mid + 136);
     float32x4_t line5_5 = vld1q_f32(mid + 140);
 
-    float32x4_t line3_0 = vsubq_f32(r0, r0_);    // mid[(18 + i) * 4 + k]   [3][0]
-    float32x4_t line3_1 = vsubq_f32(r1, r1_);    // mid[(18 + i) * 4 + k]   [3][1]
-    float32x4_t line3_2 = vsubq_f32(r2, r2_);    // mid[(18 + i) * 4 + k]   [3][2]
-    float32x4_t line3_3 = vsubq_f32(r3, r3_);    // mid[(18 + i) * 4 + k]   [3][3]
-    float32x4_t line3_4 = vsubq_f32(r4, r4_);    // mid[(18 + i) * 4 + k]   [3][4]
-    float32x4_t line3_5 = vsubq_f32(r5, r5_);    // mid[(18 + i) * 4 + k]   [3][5]
+    float32x4_t line3_0 = vsubq_f32(r0, r0_); // mid[(18 + i) * 4 + k]   [3][0]
+    float32x4_t line3_1 = vsubq_f32(r1, r1_); // mid[(18 + i) * 4 + k]   [3][1]
+    float32x4_t line3_2 = vsubq_f32(r2, r2_); // mid[(18 + i) * 4 + k]   [3][2]
+    float32x4_t line3_3 = vsubq_f32(r3, r3_); // mid[(18 + i) * 4 + k]   [3][3]
+    float32x4_t line3_4 = vsubq_f32(r4, r4_); // mid[(18 + i) * 4 + k]   [3][4]
+    float32x4_t line3_5 = vsubq_f32(r5, r5_); // mid[(18 + i) * 4 + k]   [3][5]
 
-    float32x4_t line4_0 = vaddq_f32(r0, r0_);    // mid[(24 + i) * 4 + k]  [4][0]
-    float32x4_t line4_1 = vaddq_f32(r1, r1_);    // mid[(24 + i) * 4 + k]  [4][1]
-    float32x4_t line4_2 = vaddq_f32(r2, r2_);    // mid[(24 + i) * 4 + k]  [4][2]
-    float32x4_t line4_3 = vaddq_f32(r3, r3_);    // mid[(24 + i) * 4 + k]  [4][3]
-    float32x4_t line4_4 = vaddq_f32(r4, r4_);    // mid[(24 + i) * 4 + k]  [4][4]
-    float32x4_t line4_5 = vaddq_f32(r5, r5_);    // mid[(24 + i) * 4 + k]  [4][5]
+    float32x4_t line4_0 = vaddq_f32(r0, r0_); // mid[(24 + i) * 4 + k]  [4][0]
+    float32x4_t line4_1 = vaddq_f32(r1, r1_); // mid[(24 + i) * 4 + k]  [4][1]
+    float32x4_t line4_2 = vaddq_f32(r2, r2_); // mid[(24 + i) * 4 + k]  [4][2]
+    float32x4_t line4_3 = vaddq_f32(r3, r3_); // mid[(24 + i) * 4 + k]  [4][3]
+    float32x4_t line4_4 = vaddq_f32(r4, r4_); // mid[(24 + i) * 4 + k]  [4][4]
+    float32x4_t line4_5 = vaddq_f32(r5, r5_); // mid[(24 + i) * 4 + k]  [4][5]
 
     // r4_minus_r2[i * 4 + k]   i=0     = mid[0][4]
     r0 = vsubq_f32(line0_4, line0_2);
@@ -418,30 +424,30 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r4_ = vmulq_f32(r4_, const2);
     r5_ = vmulq_f32(r5_, const2);
 
-    vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_));    // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_));  // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_));  // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_)); // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_)); // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_)); // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_)); // inp_ptr[ s_size * (3 + i * 6)]
 
-    vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_));    // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_));  // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (4 + i * 6)]
 
     float32x4_t const4 = vdupq_n_f32(4.f);
     float32x4_t const5 = vdupq_n_f32(-5.f);
-    r0_ = vmulq_f32(line0_1, const4);    // line 1*4 ========
+    r0_ = vmulq_f32(line0_1, const4); // line 1*4 ========
     r1_ = vmulq_f32(line1_1, const4);
     r2_ = vmulq_f32(line2_1, const4);
     r3_ = vmulq_f32(line3_1, const4);
     r4_ = vmulq_f32(line4_1, const4);
     r5_ = vmulq_f32(line5_1, const4);
 
-    float32x4_t rr0_ = vsubq_f32(r0_, line0_3);    // line1*4-line3
+    float32x4_t rr0_ = vsubq_f32(r0_, line0_3); // line1*4-line3
     float32x4_t rr1_ = vsubq_f32(r1_, line1_3);
     float32x4_t rr2_ = vsubq_f32(r2_, line2_3);
     float32x4_t rr3_ = vsubq_f32(r3_, line3_3);
@@ -455,28 +461,28 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r4 = vmulq_f32(line4_2, const4);
     r5 = vmulq_f32(line5_2, const4);
 
-    r0 = vsubq_f32(line0_4, r0);    // line4 -4*line2
+    r0 = vsubq_f32(line0_4, r0); // line4 -4*line2
     r1 = vsubq_f32(line1_4, r1);
     r2 = vsubq_f32(line2_4, r2);
     r3 = vsubq_f32(line3_4, r3);
     r4 = vsubq_f32(line4_4, r4);
     r5 = vsubq_f32(line5_4, r5);
 
-    vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_));    // inp_ptr[ s_size * (1 + i * 6)]
-
-    vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_));    // inp_ptr[ s_size * (2 + i * 6)]
-
-    r0_ = vaddq_f32(line0_5, r0_);    // 5 + 1*4
+    vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_));  // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_));  // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_)); // inp_ptr[ s_size * (1 + i * 6)]
+
+    vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_));  // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_));  // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_)); // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_)); // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_)); // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_)); // inp_ptr[ s_size * (2 + i * 6)]
+
+    r0_ = vaddq_f32(line0_5, r0_); // 5 + 1*4
     r1_ = vaddq_f32(line1_5, r1_);
     r2_ = vaddq_f32(line2_5, r2_);
     r3_ = vaddq_f32(line3_5, r3_);
@@ -489,12 +495,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r3 = vmulq_f32(line3_3, const5);
     r4 = vmulq_f32(line4_3, const5);
     r5 = vmulq_f32(line5_3, const5);
-    vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_));    // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_));  // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (5 + i * 6)]
 
     r0 = vmulq_f32(line0_0, const4);
     r1 = vmulq_f32(line1_0, const4);
@@ -517,12 +523,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r4 = vaddq_f32(r4, line4_4);
     r5 = vaddq_f32(r5, line5_4);
 
-    vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_));    // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_));  // inp_ptr[ s_size * (0 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_));  // inp_ptr[ s_size * (0 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (0 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (0 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (0 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (0 + i * 6)]
 
     // for(int i = 0; i < 6; i++)
     // {
@@ -552,10 +558,9 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     // }
 }
 
-
 // trans_input  [block_hw/4][ELEM_SIZE][inc][4]
 static inline void tran_input_4block(const float* input, float* trans_inp, int inc, int block_h,
-                                         int block_w, int inh, int inw)
+                                     int block_w, int inh, int inw)
 {
     int in_hw = inh * inw;
     int block_hw = block_h * block_w;
@@ -577,7 +582,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i
 
         if (idxh[0] == idxh[3])
         {
-            float* temp_inp_ptr = ( float* )(input + idxh[0] * 4 * inw + idxw[0] * 4);
+            float* temp_inp_ptr = (float*)(input + idxh[0] * 4 * inw + idxw[0] * 4);
             for (int c = 0; c < inc; c++)
             {
                 float ker00[4] = {1, 2, 4, 5};
@@ -592,13 +597,13 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i
 
             for (int c = 0; c < inc; c++)
             {
-                trans_inp_1tile(( float* )input, buffer, idxh[0], idxw[0], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[0], idxw[0], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[1], idxw[1], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[1], idxw[1], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[2], idxw[2], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[2], idxw[2], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[3], idxw[3], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[3], idxw[3], c, in_hw, inw);
                 buffer += ELEM_SIZE;
             }
             // interleave
@@ -621,7 +626,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i
 
 // tran_inp [block_hw/4][36][inc][4] -> [36][block_hw/4][inc][4]
 static inline void tran_input_4block_1(const float* input, float* trans_inp, int inc, int block_h, int block_w, int inh,
-                                       int inw,int num_thread)
+                                       int inw, int num_thread)
 {
     int in_hw = inh * inw;
     int block_hw = block_h * block_w;
@@ -631,8 +636,8 @@ static inline void tran_input_4block_1(const float* input, float* trans_inp, int
 
     int s_size = block_hw * inc * sizeof(float);
 
-#pragma omp parallel for num_threads(num_thread) shared(block_hw,nn_block,in_hw) private(idxh,idxw)
-    for(int ib = 0; ib < nn_block; ib++)
+#pragma omp parallel for num_threads(num_thread) shared(block_hw, nn_block, in_hw) private(idxh, idxw)
+    for (int ib = 0; ib < nn_block; ib++)
     {
         int off_set0 = ib * BLOCK_HW_UNIT * inc;
 
@@ -645,10 +650,10 @@ static inline void tran_input_4block_1(const float* input, float* trans_inp, int
         idxw[2] = (ib * 4 + 2) % block_w;
         idxw[3] = (ib * 4 + 3) % block_w;
 
-        if(idxh[0] == idxh[3])
+        if (idxh[0] == idxh[3])
         {
-            float* temp_inp_ptr = ( float* )(input + idxh[0] * 4 * inw + idxw[0] * 4);
-            for(int c = 0; c < inc; c++)
+            float* temp_inp_ptr = (float*)(input + idxh[0] * 4 * inw + idxw[0] * 4);
+            for (int c = 0; c < inc; c++)
             {
                 float ker00[4] = {1, 2, 4, 5};
                 tran_inp_4(temp_inp_ptr, trans_inp + c * 4 + off_set0, ker00, inw, s_size, in_hw);
@@ -660,24 +665,24 @@ static inline void tran_input_4block_1(const float* input, float* trans_inp, int
             float buffer0[inc * ELEM_SIZE * BLOCK_HW_UNIT];
             float* buffer = buffer0;
 
-            for(int c = 0; c < inc; c++)
+            for (int c = 0; c < inc; c++)
             {
-                trans_inp_1tile(( float* )input, buffer, idxh[0], idxw[0], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[0], idxw[0], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[1], idxw[1], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[1], idxw[1], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[2], idxw[2], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[2], idxw[2], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[3], idxw[3], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[3], idxw[3], c, in_hw, inw);
                 buffer += ELEM_SIZE;
             }
             // interleave
-            for(int s = 0; s < ELEM_SIZE; s++)
+            for (int s = 0; s < ELEM_SIZE; s++)
             {
                 float* tmp_inp = trans_inp + s * block_hw * inc + off_set0;
-                for(int i = 0; i < inc; i++)
+                for (int i = 0; i < inc; i++)
                 {
-                    for(int j = 0; j < BLOCK_HW_UNIT; j++)
+                    for (int j = 0; j < BLOCK_HW_UNIT; j++)
                     {
                         *tmp_inp = buffer0[i * ELEM_SIZE * BLOCK_HW_UNIT + j * ELEM_SIZE + s];
                         tmp_inp++;
@@ -701,7 +706,7 @@ static inline void tran_input_resi_block(const float* input, float* trans_inp, i
         {
             int ih = ib / block_w;
             int jw = ib % block_w;
-            trans_inp_1tile(( float* )input, buffer, ih, jw, c, in_hw, inw);
+            trans_inp_1tile((float*)input, buffer, ih, jw, c, in_hw, inw);
             buffer += ELEM_SIZE;
         }
         // interleave
@@ -717,29 +722,28 @@ static inline void tran_input_resi_block(const float* input, float* trans_inp, i
     }
 }
 
-
 // tran_inp [block_resi][36][inc] -> [36][block_resi][inc]
 static inline void tran_input_resi_block_1(const float* input, float* trans_inp, int inc, int nn_block, int resi_block,
                                            int block_hw, int block_w, int in_hw, int inw)
 {
-    for(int ib = resi_block; ib < block_hw; ib++)
+    for (int ib = resi_block; ib < block_hw; ib++)
     {
         int off_set0 = ib * inc;
 
         float buffer0[ELEM_SIZE * inc];
         float* buffer = buffer0;
-        for(int c = 0; c < inc; c++)
+        for (int c = 0; c < inc; c++)
         {
             int ih = ib / block_w;
             int jw = ib % block_w;
-            trans_inp_1tile(( float* )input, buffer, ih, jw, c, in_hw, inw);
+            trans_inp_1tile((float*)input, buffer, ih, jw, c, in_hw, inw);
             buffer += ELEM_SIZE;
         }
         // interleave
-        for(int s = 0; s < ELEM_SIZE; s++)
+        for (int s = 0; s < ELEM_SIZE; s++)
         {
             float* tmp_inp = trans_inp + s * block_hw * inc + off_set0;
-            for(int i = 0; i < inc; i++)
+            for (int i = 0; i < inc; i++)
             {
                 *tmp_inp = buffer0[i * ELEM_SIZE + s];
                 tmp_inp++;
@@ -749,7 +753,6 @@ static inline void tran_input_resi_block_1(const float* input, float* trans_inp,
     }
 }
 
-
 static inline float do_activation(float value, int activation)
 {
     if (activation >= 0)
@@ -961,8 +964,7 @@ static inline void transform_output_f43_1tile(const float* buffer_ptr, float* ou
         float* out_ptr = out + cout_idx * out_hw;
         int i_h = idx_blockhw / block_w;
         int j_w = idx_blockhw % block_w;
-        if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-            (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+        if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
         {
             trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation);
         }
@@ -1016,17 +1018,16 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int
         float* out_ptr = out + cout_idx * out_hw;
         if (bias)
         {
-            bias_ptr = ( float* )bias + cout_idx;
+            bias_ptr = (float*)bias + cout_idx;
         }
         for (int ii = 0; ii < 4; ii++)
         {
             int i_h = idx_h[ii];
             int j_w = idx_w[ii];
-            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-                (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
             {
                 trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation);
-            }    // direct use_out_ptr
+            } // direct use_out_ptr
             else
             {
                 int ret_h = TILE - resi_h;
@@ -1046,7 +1047,7 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int
                         out_pointer[hh * outw + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation);
                     }
                 }
-            }    // end else, tmp_buff
+            } // end else, tmp_buff
             buffer_ptr += ELEM_SIZE;
         }
     }
@@ -1055,17 +1056,17 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int
 // trans_input  [block_hw/4][ELEM_SIZE][inc][4]
 // kernel       [out_c/PER_OUT_CHAN][ELEM_SIZE][in_c][PER_OUT_CHAN]
 static void wino_sgemm_4x16_1(const float* ker, const float* inp, float* output, int cin, int cout_end,
-                              int block_h, int block_w,  int out_c, int num_thread, int s, int cpu_affinity)
+                              int block_h, int block_w, int out_c, int num_thread, int s, int cpu_affinity)
 {
     int block_hw = block_h * block_w;
 
-    #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
     for (int p = 0; p < (cout_end & -PER_OUT_CHAN); p += PER_OUT_CHAN)
     {
-        float * out_ptr = output + p * ELEM_SIZE * block_hw;
-        float * out_ptr1 ;
+        float* out_ptr = output + p * ELEM_SIZE * block_hw;
+        float* out_ptr1;
         int i;
-        
+
         for (i = 0; i < (block_hw & -4); i += 4)
         {
             out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT;
@@ -1073,11 +1074,11 @@ static void wino_sgemm_4x16_1(const float* ker, const float* inp, float* output,
             int offset = s * block_hw * cin + i * cin;
             int offset_ker = s * cin * out_c + p * cin;
 
-//#ifdef __aarch64__
+            //#ifdef __aarch64__
             wino_sgemm_4x16_A72(out_ptr1 + s * BLOCK_HW_UNIT, inp + offset, ker + offset_ker, cin, 1);
         }
-        
-        for(; i < block_hw ;i++)
+
+        for (; i < block_hw; i++)
         {
             out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT;
 
@@ -1090,7 +1091,7 @@ static void wino_sgemm_4x16_1(const float* ker, const float* inp, float* output,
 }
 
 void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin, int cout_start,
-                    int cout_end, int block_h, int block_w, int out_c, int activation, int s, int num_thread, int cpu_affinity)
+                      int cout_end, int block_h, int block_w, int out_c, int activation, int s, int num_thread, int cpu_affinity)
 {
     int block_start = 0;
     int block_hw = block_h * block_w;
@@ -1102,15 +1103,15 @@ void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin
         float* out_ptr = output + p * ELEM_SIZE * block_hw;
 
         int i = 0;
-        for(i = (block_start & -4); i < (block_end & -4); i += 4)
+        for (i = (block_start & -4); i < (block_end & -4); i += 4)
         {
             float* out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT4;
             int offset = s * block_hw * cin + i * cin;
             int offset_ker = s * cin * out_c + p * cin;
-//#ifdef __aarch64__
+            //#ifdef __aarch64__
             wino_sgemm_4x4_A72(out_ptr1 + s * BLOCK_HW_UNIT, inp + offset, ker + offset_ker, cin, 1);
         }
-        for(; i < block_end; i++)
+        for (; i < block_end; i++)
         {
             float* out_ptr1 = out_ptr + i * ELEM_SIZE * KER_COUT_UNIT4;
 
@@ -1128,14 +1129,14 @@ void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin
         for (i = (block_start & -4); i < (block_end & -4); i += 4)
         {
             float* out_ptr1 = out_ptr + i * ELEM_SIZE + s * BLOCK_HW_UNIT;
-            float* inp_ = (float*)(inp + s * block_hw * cin + i*cin);
+            float* inp_ = (float*)(inp + s * block_hw * cin + i * cin);
             float sum0 = 0;
             float sum1 = 0;
             float sum2 = 0;
             float sum3 = 0;
             for (int k = 0; k < cin; k++)
             {
-                sum0 += inp_[k * 4    ] * ker_[k];
+                sum0 += inp_[k * 4] * ker_[k];
                 sum1 += inp_[k * 4 + 1] * ker_[k];
                 sum2 += inp_[k * 4 + 2] * ker_[k];
                 sum3 += inp_[k * 4 + 3] * ker_[k];
@@ -1145,12 +1146,13 @@ void wino_sgemm_4x4_1(const float* ker, const float* inp, float* output, int cin
             out_ptr1[2] = sum2;
             out_ptr1[3] = sum3;
         }
-        for(; i < block_end; i++)
-		{
+        for (; i < block_end; i++)
+        {
             float* out_ptr1 = out_ptr + i * ELEM_SIZE + s;
-            float* inp_ = (float*)(inp + s * block_hw * cin + i*cin);
+            float* inp_ = (float*)(inp + s * block_hw * cin + i * cin);
             float sum0 = 0;
-            for(int k = 0; k < cin; k++){
+            for (int k = 0; k < cin; k++)
+            {
                 sum0 += inp_[k] * ker_[k];
             }
             out_ptr1[0] = sum0;
@@ -1163,13 +1165,14 @@ static inline void trans_output_p(float* trans_out_ptr,
                                   float* output, float* bias, int bias_term,
                                   int block_h, int block_w, int block_hw,
                                   int out_hw, int out_w, int resi_h, int resi_w,
-                                  int activation,int p,int KER_COUT_UNIT_)
+                                  int activation, int p, int KER_COUT_UNIT_)
 {
     int flag_outw = 1;
-    if(out_w < 16)
+    if (out_w < 16)
         flag_outw = 0;
     int i;
-    for(i=0; i< (block_hw & -BLOCK_HW_UNIT); i+=BLOCK_HW_UNIT){
+    for (i = 0; i < (block_hw & -BLOCK_HW_UNIT); i += BLOCK_HW_UNIT)
+    {
         float* buffer_ptr = trans_out_ptr + i * KER_COUT_UNIT_ * ELEM_SIZE;
         int idx_h[4];
         int idx_w[4];
@@ -1183,59 +1186,73 @@ static inline void trans_output_p(float* trans_out_ptr,
         idx_w[2] = (i + 2) % block_w;
         idx_w[3] = (i + 3) % block_w;
         int wino_out_4_tiles = 0;
-        if(flag_outw){
-            if((idx_h[0] == idx_h[3]) && (idx_h[0] < (block_h - 1)) && (idx_w[3] < (block_w - 1))){
+        if (flag_outw)
+        {
+            if ((idx_h[0] == idx_h[3]) && (idx_h[0] < (block_h - 1)) && (idx_w[3] < (block_w - 1)))
+            {
                 wino_out_4_tiles = 1;
             }
         }
-        if(wino_out_4_tiles == 1){
+        if (wino_out_4_tiles == 1)
+        {
             float* bias_ptr = NULL;
-            for(int pss = 0; pss < KER_COUT_UNIT_; pss++){
+            for (int pss = 0; pss < KER_COUT_UNIT_; pss++)
+            {
                 int cout_idx = p + pss;
                 float* out_ptr = output + cout_idx * out_hw + idx_h[0] * TILE * out_w + idx_w[0] * TILE;
-                if(bias_term){
-                    bias_ptr = ( float* )(bias + cout_idx);
+                if (bias_term)
+                {
+                    bias_ptr = (float*)(bias + cout_idx);
                 }
                 float ker00[4] = {2, 4, 8, 0};
                 tran_out_4(buffer_ptr + pss * ELEM_SIZE * BLOCK_HW_UNIT, out_ptr, out_w * sizeof(float), ker00,
                            bias_ptr, activation);
             }
         }
-        else{
+        else
+        {
             float tmp_buffer[TILE * TILE];
             const float* bias_ptr = NULL;
-            for(int pss = 0; pss < KER_COUT_UNIT_; pss++){
+            for (int pss = 0; pss < KER_COUT_UNIT_; pss++)
+            {
                 int cout_idx = p + pss;
                 float* out_ptr = output + cout_idx * out_hw;
-                if(bias_term){
+                if (bias_term)
+                {
                     bias_ptr = bias + cout_idx;
                 }
                 float buffer[BLOCK_HW_UNIT * ELEM_SIZE];
                 float* buffer_ptr0 = buffer;
                 float* mid_ptr = buffer_ptr + pss * BLOCK_HW_UNIT * ELEM_SIZE;
-                for(int t = 0; t < BLOCK_HW_UNIT; t++){
-                    for(int ss = 0; ss < ELEM_SIZE; ss++){
+                for (int t = 0; t < BLOCK_HW_UNIT; t++)
+                {
+                    for (int ss = 0; ss < ELEM_SIZE; ss++)
+                    {
                         *buffer_ptr0 = mid_ptr[ss * BLOCK_HW_UNIT + t];
                         buffer_ptr0++;
                     }
                 }
-                for(int ii = 0; ii < BLOCK_HW_UNIT; ii++){
+                for (int ii = 0; ii < BLOCK_HW_UNIT; ii++)
+                {
                     int i_h = idx_h[ii];
                     int j_w = idx_w[ii];
-                    if((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-                       (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1))){
+                    if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+                    {
                         trans_output_f43(buffer + ii * ELEM_SIZE, out_ptr + (i_h * TILE * out_w + j_w * TILE),
-                                         out_w, ( const float* )bias_ptr, activation);
+                                         out_w, (const float*)bias_ptr, activation);
                     }
-                    else{
+                    else
+                    {
                         int ret_h = TILE - resi_h;
-                        if(i_h < block_h - 1) ret_h = TILE;
+                        if (i_h < block_h - 1) ret_h = TILE;
                         int ret_w = TILE - resi_w;
-                        if(j_w < block_w - 1) ret_w = TILE;
-                        trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, ( const float* )bias_ptr);
+                        if (j_w < block_w - 1) ret_w = TILE;
+                        trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, (const float*)bias_ptr);
                         float* out_pointer = out_ptr + (i_h * TILE * out_w + j_w * TILE);
-                        for(int hh = 0; hh < ret_h; hh++){
-                            for(int ww = 0; ww < ret_w; ww++){
+                        for (int hh = 0; hh < ret_h; hh++)
+                        {
+                            for (int ww = 0; ww < ret_w; ww++)
+                            {
                                 out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation);
                             }
                         }
@@ -1244,12 +1261,15 @@ static inline void trans_output_p(float* trans_out_ptr,
             }
         }
     }
-    for(; i < block_hw; i++){
+    for (; i < block_hw; i++)
+    {
         float* buffer_ptr = trans_out_ptr + i * KER_COUT_UNIT_ * ELEM_SIZE;
         float resi_buffer[KER_COUT_UNIT_ * ELEM_SIZE];
         float* buffer0 = resi_buffer;
-        for(int pp = 0; pp < KER_COUT_UNIT_; pp++){
-            for(int ss = 0; ss < ELEM_SIZE; ss++){
+        for (int pp = 0; pp < KER_COUT_UNIT_; pp++)
+        {
+            for (int ss = 0; ss < ELEM_SIZE; ss++)
+            {
                 *buffer0 = buffer_ptr[ss * KER_COUT_UNIT_ + pp];
                 buffer0++;
             }
@@ -1259,17 +1279,17 @@ static inline void trans_output_p(float* trans_out_ptr,
     }
 }
 
-
 // transform output
 static inline void trans_output_1(float* trans_out, float* output, float* bias, int bias_term, int block_h, int block_w,
-                                int cout_start, int cout_end, int out_hw, int out_w, int resi_h, int resi_w,
-                                  int activation,int num_thread)
+                                  int cout_start, int cout_end, int out_hw, int out_w, int resi_h, int resi_w,
+                                  int activation, int num_thread)
 {
     int block_hw = block_h * block_w;
     int p;
     //cout 16
 #pragma omp parallel for num_threads(num_thread) shared(block_hw)
-    for(p = cout_start; p < (cout_end& -KER_COUT_UNIT); p+=KER_COUT_UNIT){
+    for (p = cout_start; p < (cout_end & -KER_COUT_UNIT); p += KER_COUT_UNIT)
+    {
         trans_output_p(trans_out + p * block_hw * ELEM_SIZE,
                        output, bias, bias_term,
                        block_h, block_w, block_hw,
@@ -1278,7 +1298,8 @@ static inline void trans_output_1(float* trans_out, float* output, float* bias,
     }
     //cout 4
 #pragma omp parallel for num_threads(num_thread) shared(block_hw)
-    for(p = (cout_end & -KER_COUT_UNIT); p < (cout_end & -KER_COUT_UNIT4); p += KER_COUT_UNIT4){
+    for (p = (cout_end & -KER_COUT_UNIT); p < (cout_end & -KER_COUT_UNIT4); p += KER_COUT_UNIT4)
+    {
         trans_output_p(trans_out + p * block_hw * ELEM_SIZE,
                        output, bias, bias_term,
                        block_h, block_w, block_hw,
@@ -1287,7 +1308,8 @@ static inline void trans_output_1(float* trans_out, float* output, float* bias,
     }
     // cout 1
 #pragma omp parallel for num_threads(num_thread) shared(block_hw)
-    for(p=(cout_end & -KER_COUT_UNIT4); p < cout_end; p ++){
+    for (p = (cout_end & -KER_COUT_UNIT4); p < cout_end; p++)
+    {
         trans_output_p(trans_out + p * block_hw * ELEM_SIZE,
                        output, bias, bias_term,
                        block_h, block_w, block_hw,
@@ -1301,17 +1323,17 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
     int output_c = filter->dims[0];
     int input_c = filter->dims[1];
     int trans_ker_size = output_c * input_c * ELEM_SIZE * sizeof(float);
-    return trans_ker_size + 128;    // caution
+    return trans_ker_size + 128; // caution
 }
 
 int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_tensor,
-                         struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param)
+                           struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param)
 {
     // fTLOG_ERR(stderr,"run into wino_1 prerun.\n");
     int output_c = filter_tensor->dims[0];
     int input_c = filter_tensor->dims[1];
     int mem_size = get_private_mem_size(filter_tensor, param);
-    float* trans_mem = ( float* )sys_malloc(mem_size);
+    float* trans_mem = (float*)sys_malloc(mem_size);
 
     if (!priv_info->external_interleave_mem)
     {
@@ -1321,7 +1343,7 @@ int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_te
     }
 
     transform_kernel_f43_tile(filter_tensor, trans_mem);
-    interleave_kernel_1(trans_mem, ( float* )priv_info->interleave_buffer, output_c, input_c);
+    interleave_kernel_1(trans_mem, (float*)priv_info->interleave_buffer, output_c, input_c);
 
     sys_free(trans_mem);
 
@@ -1329,8 +1351,8 @@ int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_te
 }
 
 int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                      struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                      int num_thread, int cpu_affinity)
+                        struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
+                        int num_thread, int cpu_affinity)
 {
     int kernel_h = param->kernel_h;
     int kernel_w = param->kernel_w;
@@ -1368,19 +1390,19 @@ int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tenso
     int padded_in_hw = padded_in_h * padded_in_w;
 
     /* buffer addr */
-    float* input_buf = ( float* )input_tensor->data;
-    float* output_buf = ( float* )output_tensor->data;
+    float* input_buf = (float*)input_tensor->data;
+    float* output_buf = (float*)output_tensor->data;
     float* biases_buf = NULL;
     int bias_term = 0;
 
     if (bias_tensor != NULL)
     {
-        biases_buf = ( float* )bias_tensor->data;
+        biases_buf = (float*)bias_tensor->data;
         bias_term = 1;
     }
 
-    float* col_buf = ( float* )priv_info->im2col_buffer;
-    float* interleave_buf = ( float* )priv_info->interleave_buffer;
+    float* col_buf = (float*)priv_info->im2col_buffer;
+    float* interleave_buf = (float*)priv_info->interleave_buffer;
 
     int inp_padded_size = sizeof(float) * (in_c * padded_in_hw + 2);
 
@@ -1393,9 +1415,9 @@ int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tenso
 
     for (int n = 0; n < batch; n++)
     {
-        float* input_padded = ( float* )sys_malloc(inp_padded_size);
-        float* trans_inp = ( float* )sys_malloc(sizeof(float) * ELEM_SIZE * in_c * block_hw + 128);
-        float* trans_out = ( float* )sys_malloc(sizeof(float) * ELEM_SIZE * out_c * block_hw);
+        float* input_padded = (float*)sys_malloc(inp_padded_size);
+        float* trans_inp = (float*)sys_malloc(sizeof(float) * ELEM_SIZE * in_c * block_hw + 128);
+        float* trans_out = (float*)sys_malloc(sizeof(float) * ELEM_SIZE * out_c * block_hw);
 
         float* input = input_buf + n * input_size;
         float* output = output_buf + n * output_size;
@@ -1409,24 +1431,24 @@ int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tenso
         if (resi_block != block_hw)
         {
             tran_input_resi_block_1(input_padded, trans_inp, in_c, nn_block, resi_block, block_hw, block_w,
-                                  padded_in_hw, padded_in_w);
+                                    padded_in_hw, padded_in_w);
         }
         sys_free(input_padded);
 
         /* gemm */
-        for(int s = 0; s < ELEM_SIZE; s++)
+        for (int s = 0; s < ELEM_SIZE; s++)
         {
             wino_sgemm_4x16_1(interleave_buf, trans_inp, trans_out, in_c, nn_out_c, block_h, block_w,
-                            out_c, num_thread, s, cpu_affinity);
+                              out_c, num_thread, s, cpu_affinity);
             if (nn_out_c != out_c)
             {
-               wino_sgemm_4x4_1(interleave_buf, trans_inp, trans_out, in_c, nn_out_c,
-                                 out_c, block_h, block_w, out_c, act_type, s ,num_thread, cpu_affinity);
+                wino_sgemm_4x4_1(interleave_buf, trans_inp, trans_out, in_c, nn_out_c,
+                                 out_c, block_h, block_w, out_c, act_type, s, num_thread, cpu_affinity);
             }
         }
         sys_free(trans_inp);
         trans_output_1(trans_out, output, biases_buf, bias_term, block_h, block_w, 0, out_c, out_hw, out_w, resi_h, resi_w,
-                       act_type,num_thread);
+                       act_type, num_thread);
 
         sys_free(trans_out);
     }
diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h
index 53a45a9ec..b4b3298d0 100644
--- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_1_arm.h
@@ -30,13 +30,11 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int wino_conv_hcl_prerun_1(struct tensor* input_tensor, struct tensor* filter_tensor,
-                         struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param)
-   ;
+                           struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param);
 
 int wino_conv_hcl_run_1(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                      struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param,
-                      int num_thread, int affinity);
+                        struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param,
+                        int num_thread, int affinity);
 
 #endif
diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c
index c6a3b1525..50c2025dd 100644
--- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c
+++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.c
@@ -34,8 +34,7 @@
 
 #include <arm_neon.h>
 
-
-#define TILE 4
+#define TILE      4
 #define ELEM_SIZE ((TILE + 2) * (TILE + 2))
 
 #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -93,10 +92,10 @@ static inline void trans_kernel_f43(float* ker, float* trans_ker)
     */
     float tmp[18] = {0};
 
-    float neg_r0_add_r2_x_1_6[6];    // (r0+r2)*1./6
-    float r0_1_4_add_r2_x_1_6[6];    // (r0*1/4 + r2)*1./6
-    float r1_1_6[6];    // r1*1/6
-    float r1_1_12[6];    // r1*1/12
+    float neg_r0_add_r2_x_1_6[6]; // (r0+r2)*1./6
+    float r0_1_4_add_r2_x_1_6[6]; // (r0*1/4 + r2)*1./6
+    float r1_1_6[6];              // r1*1/6
+    float r1_1_12[6];             // r1*1/12
     float s_1_6 = 1. / 6.f;
     for (int j = 0; j < 3; j++)
     {
@@ -142,14 +141,14 @@ static inline void transform_kernel_f43_tile(struct tensor* filter, float* trans
 {
     int outc = filter->dims[0];
     int inc = filter->dims[1];
-    float* kernel = ( float* )filter->data;
+    float* kernel = (float*)filter->data;
     float* ker_ptr = trans_ker;
 
     for (int i = 0; i < outc; i++)
     {
         for (int j = 0; j < inc; j++)
         {
-            trans_kernel_f43(( float* )(kernel + 9 * (j + i * inc)), ker_ptr);
+            trans_kernel_f43((float*)(kernel + 9 * (j + i * inc)), ker_ptr);
             ker_ptr += ELEM_SIZE;
         }
     }
@@ -212,7 +211,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in
     int padded_hw = padded_h * padded_w;
 
     float* pad_ptr;
-    float* inp_ptr = ( float* )input;
+    float* inp_ptr = (float*)input;
     int resi_h = padded_h - pad0 - inh;
     int resi_w = padded_w - pad1 - inw;
     for (int c = 0; c < inc; c++)
@@ -241,7 +240,7 @@ static inline void pad_input1(const float* input, float* inp_padded, int inc, in
 
 static inline void trans_inp_1tile(float* input, float* inp_ptr, int ih, int jw, int c, int in_hw, int inw)
 {
-    float* inp = ( float* )input + c * in_hw + ih * 4 * inw + jw * 4;
+    float* inp = (float*)input + c * in_hw + ih * 4 * inw + jw * 4;
     float* inp0 = inp;
     float* inp1 = inp0 + inw;
     float* inp2 = inp1 + inw;
@@ -383,19 +382,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     float32x4_t line0_4 = vld1q_f32(mid + 16);
     float32x4_t line0_5 = vld1q_f32(mid + 20);
 
-    float32x4_t line1_0 = vsubq_f32(r0, r0_);    // mid[(6 + i) * 4 + k]   [1][0]
-    float32x4_t line1_1 = vsubq_f32(r1, r1_);    // mid[(6 + i) * 4 + k]   [1][1]
-    float32x4_t line1_2 = vsubq_f32(r2, r2_);    // mid[(6 + i) * 4 + k]   [1][2]
-    float32x4_t line1_3 = vsubq_f32(r3, r3_);    // mid[(6 + i) * 4 + k]   [1][3]
-    float32x4_t line1_4 = vsubq_f32(r4, r4_);    // mid[(6 + i) * 4 + k]   [1][4]
-    float32x4_t line1_5 = vsubq_f32(r5, r5_);    // mid[(6 + i) * 4 + k]   [1][5]
+    float32x4_t line1_0 = vsubq_f32(r0, r0_); // mid[(6 + i) * 4 + k]   [1][0]
+    float32x4_t line1_1 = vsubq_f32(r1, r1_); // mid[(6 + i) * 4 + k]   [1][1]
+    float32x4_t line1_2 = vsubq_f32(r2, r2_); // mid[(6 + i) * 4 + k]   [1][2]
+    float32x4_t line1_3 = vsubq_f32(r3, r3_); // mid[(6 + i) * 4 + k]   [1][3]
+    float32x4_t line1_4 = vsubq_f32(r4, r4_); // mid[(6 + i) * 4 + k]   [1][4]
+    float32x4_t line1_5 = vsubq_f32(r5, r5_); // mid[(6 + i) * 4 + k]   [1][5]
 
-    float32x4_t line2_0 = vaddq_f32(r0, r0_);    // mid[(12 + i) * 4 + k]  [2][0]
-    float32x4_t line2_1 = vaddq_f32(r1, r1_);    // mid[(12 + i) * 4 + k]  [2][1]
-    float32x4_t line2_2 = vaddq_f32(r2, r2_);    // mid[(12 + i) * 4 + k]  [2][2]
-    float32x4_t line2_3 = vaddq_f32(r3, r3_);    // mid[(12 + i) * 4 + k]  [2][3]
-    float32x4_t line2_4 = vaddq_f32(r4, r4_);    // mid[(12 + i) * 4 + k]  [2][4]
-    float32x4_t line2_5 = vaddq_f32(r5, r5_);    // mid[(12 + i) * 4 + k]  [2][5]
+    float32x4_t line2_0 = vaddq_f32(r0, r0_); // mid[(12 + i) * 4 + k]  [2][0]
+    float32x4_t line2_1 = vaddq_f32(r1, r1_); // mid[(12 + i) * 4 + k]  [2][1]
+    float32x4_t line2_2 = vaddq_f32(r2, r2_); // mid[(12 + i) * 4 + k]  [2][2]
+    float32x4_t line2_3 = vaddq_f32(r3, r3_); // mid[(12 + i) * 4 + k]  [2][3]
+    float32x4_t line2_4 = vaddq_f32(r4, r4_); // mid[(12 + i) * 4 + k]  [2][4]
+    float32x4_t line2_5 = vaddq_f32(r5, r5_); // mid[(12 + i) * 4 + k]  [2][5]
 
     r0 = vld1q_f32(r4_minus_r2);
     r1 = vld1q_f32(r4_minus_r2 + 4);
@@ -418,19 +417,19 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     float32x4_t line5_4 = vld1q_f32(mid + 136);
     float32x4_t line5_5 = vld1q_f32(mid + 140);
 
-    float32x4_t line3_0 = vsubq_f32(r0, r0_);    // mid[(18 + i) * 4 + k]   [3][0]
-    float32x4_t line3_1 = vsubq_f32(r1, r1_);    // mid[(18 + i) * 4 + k]   [3][1]
-    float32x4_t line3_2 = vsubq_f32(r2, r2_);    // mid[(18 + i) * 4 + k]   [3][2]
-    float32x4_t line3_3 = vsubq_f32(r3, r3_);    // mid[(18 + i) * 4 + k]   [3][3]
-    float32x4_t line3_4 = vsubq_f32(r4, r4_);    // mid[(18 + i) * 4 + k]   [3][4]
-    float32x4_t line3_5 = vsubq_f32(r5, r5_);    // mid[(18 + i) * 4 + k]   [3][5]
+    float32x4_t line3_0 = vsubq_f32(r0, r0_); // mid[(18 + i) * 4 + k]   [3][0]
+    float32x4_t line3_1 = vsubq_f32(r1, r1_); // mid[(18 + i) * 4 + k]   [3][1]
+    float32x4_t line3_2 = vsubq_f32(r2, r2_); // mid[(18 + i) * 4 + k]   [3][2]
+    float32x4_t line3_3 = vsubq_f32(r3, r3_); // mid[(18 + i) * 4 + k]   [3][3]
+    float32x4_t line3_4 = vsubq_f32(r4, r4_); // mid[(18 + i) * 4 + k]   [3][4]
+    float32x4_t line3_5 = vsubq_f32(r5, r5_); // mid[(18 + i) * 4 + k]   [3][5]
 
-    float32x4_t line4_0 = vaddq_f32(r0, r0_);    // mid[(24 + i) * 4 + k]  [4][0]
-    float32x4_t line4_1 = vaddq_f32(r1, r1_);    // mid[(24 + i) * 4 + k]  [4][1]
-    float32x4_t line4_2 = vaddq_f32(r2, r2_);    // mid[(24 + i) * 4 + k]  [4][2]
-    float32x4_t line4_3 = vaddq_f32(r3, r3_);    // mid[(24 + i) * 4 + k]  [4][3]
-    float32x4_t line4_4 = vaddq_f32(r4, r4_);    // mid[(24 + i) * 4 + k]  [4][4]
-    float32x4_t line4_5 = vaddq_f32(r5, r5_);    // mid[(24 + i) * 4 + k]  [4][5]
+    float32x4_t line4_0 = vaddq_f32(r0, r0_); // mid[(24 + i) * 4 + k]  [4][0]
+    float32x4_t line4_1 = vaddq_f32(r1, r1_); // mid[(24 + i) * 4 + k]  [4][1]
+    float32x4_t line4_2 = vaddq_f32(r2, r2_); // mid[(24 + i) * 4 + k]  [4][2]
+    float32x4_t line4_3 = vaddq_f32(r3, r3_); // mid[(24 + i) * 4 + k]  [4][3]
+    float32x4_t line4_4 = vaddq_f32(r4, r4_); // mid[(24 + i) * 4 + k]  [4][4]
+    float32x4_t line4_5 = vaddq_f32(r5, r5_); // mid[(24 + i) * 4 + k]  [4][5]
 
     // r4_minus_r2[i * 4 + k]   i=0     = mid[0][4]
     r0 = vsubq_f32(line0_4, line0_2);
@@ -455,30 +454,30 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r4_ = vmulq_f32(r4_, const2);
     r5_ = vmulq_f32(r5_, const2);
 
-    vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_));    // inp_ptr[ s_size * (3 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_));    // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 3, vsubq_f32(r0, r0_));  // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 9, vsubq_f32(r1, r1_));  // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 15, vsubq_f32(r2, r2_)); // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 21, vsubq_f32(r3, r3_)); // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 27, vsubq_f32(r4, r4_)); // inp_ptr[ s_size * (3 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 33, vsubq_f32(r5, r5_)); // inp_ptr[ s_size * (3 + i * 6)]
 
-    vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_));    // inp_ptr[ s_size * (4 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_));    // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 4, vaddq_f32(r0, r0_));  // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 10, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 16, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 22, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 28, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (4 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 34, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (4 + i * 6)]
 
     float32x4_t const4 = vdupq_n_f32(4.f);
     float32x4_t const5 = vdupq_n_f32(-5.f);
-    r0_ = vmulq_f32(line0_1, const4);    // line 1*4 ========
+    r0_ = vmulq_f32(line0_1, const4); // line 1*4 ========
     r1_ = vmulq_f32(line1_1, const4);
     r2_ = vmulq_f32(line2_1, const4);
     r3_ = vmulq_f32(line3_1, const4);
     r4_ = vmulq_f32(line4_1, const4);
     r5_ = vmulq_f32(line5_1, const4);
 
-    float32x4_t rr0_ = vsubq_f32(r0_, line0_3);    // line1*4-line3
+    float32x4_t rr0_ = vsubq_f32(r0_, line0_3); // line1*4-line3
     float32x4_t rr1_ = vsubq_f32(r1_, line1_3);
     float32x4_t rr2_ = vsubq_f32(r2_, line2_3);
     float32x4_t rr3_ = vsubq_f32(r3_, line3_3);
@@ -492,28 +491,28 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r4 = vmulq_f32(line4_2, const4);
     r5 = vmulq_f32(line5_2, const4);
 
-    r0 = vsubq_f32(line0_4, r0);    // line4 -4*line2
+    r0 = vsubq_f32(line0_4, r0); // line4 -4*line2
     r1 = vsubq_f32(line1_4, r1);
     r2 = vsubq_f32(line2_4, r2);
     r3 = vsubq_f32(line3_4, r3);
     r4 = vsubq_f32(line4_4, r4);
     r5 = vsubq_f32(line5_4, r5);
 
-    vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_));    // inp_ptr[ s_size * (1 + i * 6)]
-
-    vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_));    // inp_ptr[ s_size * (2 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_));    // inp_ptr[ s_size * (2 + i * 6)]
-
-    r0_ = vaddq_f32(line0_5, r0_);    // 5 + 1*4
+    vst1q_f32(inp_ptr + s_size * 1, vsubq_f32(r0, rr0_));  // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 7, vsubq_f32(r1, rr1_));  // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 13, vsubq_f32(r2, rr2_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 19, vsubq_f32(r3, rr3_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 25, vsubq_f32(r4, rr4_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 31, vsubq_f32(r5, rr5_)); // inp_ptr[ s_size * (1 + i * 6)]
+
+    vst1q_f32(inp_ptr + s_size * 2, vaddq_f32(r0, rr0_));  // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 8, vaddq_f32(r1, rr1_));  // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 14, vaddq_f32(r2, rr2_)); // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 20, vaddq_f32(r3, rr3_)); // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 26, vaddq_f32(r4, rr4_)); // inp_ptr[ s_size * (2 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 32, vaddq_f32(r5, rr5_)); // inp_ptr[ s_size * (2 + i * 6)]
+
+    r0_ = vaddq_f32(line0_5, r0_); // 5 + 1*4
     r1_ = vaddq_f32(line1_5, r1_);
     r2_ = vaddq_f32(line2_5, r2_);
     r3_ = vaddq_f32(line3_5, r3_);
@@ -526,12 +525,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r3 = vmulq_f32(line3_3, const5);
     r4 = vmulq_f32(line4_3, const5);
     r5 = vmulq_f32(line5_3, const5);
-    vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_));    // inp_ptr[ s_size * (5 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_));    // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 5, vaddq_f32(r0, r0_));  // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 11, vaddq_f32(r1, r1_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 17, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 23, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 29, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (5 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 35, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (5 + i * 6)]
 
     r0 = vmulq_f32(line0_0, const4);
     r1 = vmulq_f32(line1_0, const4);
@@ -554,12 +553,12 @@ static inline void trans_inp_4_cpu(float* inp, float* inp_ptr, int inw, int s_si
     r4 = vaddq_f32(r4, line4_4);
     r5 = vaddq_f32(r5, line5_4);
 
-    vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_));    // inp_ptr[ s_size * (1 + i * 6)]
-    vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_));    // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 0, vaddq_f32(r0, r0_));  // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 6, vaddq_f32(r1, r1_));  // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 12, vaddq_f32(r2, r2_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 18, vaddq_f32(r3, r3_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 24, vaddq_f32(r4, r4_)); // inp_ptr[ s_size * (1 + i * 6)]
+    vst1q_f32(inp_ptr + s_size * 30, vaddq_f32(r5, r5_)); // inp_ptr[ s_size * (1 + i * 6)]
 
     // for(int i = 0; i < 6; i++)
     // {
@@ -599,7 +598,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i
     int idxh[4];
     int idxw[4];
 
-#pragma omp parallel for num_threads(num_thread) shared(block_hw,nn_block,in_hw) private(idxh,idxw)
+#pragma omp parallel for num_threads(num_thread) shared(block_hw, nn_block, in_hw) private(idxh, idxw)
     for (int ib = 0; ib < nn_block; ib++)
     {
         float* inp_ptr_4tile = trans_inp + ib * 4 * ELEM_SIZE * inc;
@@ -614,7 +613,7 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i
 
         if (idxh[0] == idxh[3])
         {
-            float* temp_inp_ptr = ( float* )(input + idxh[0] * 4 * inw + idxw[0] * 4);
+            float* temp_inp_ptr = (float*)(input + idxh[0] * 4 * inw + idxw[0] * 4);
             for (int c = 0; c < inc; c++)
             {
 #ifdef __aarch64__
@@ -634,13 +633,13 @@ static inline void tran_input_4block(const float* input, float* trans_inp, int i
 
             for (int c = 0; c < inc; c++)
             {
-                trans_inp_1tile(( float* )input, buffer, idxh[0], idxw[0], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[0], idxw[0], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[1], idxw[1], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[1], idxw[1], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[2], idxw[2], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[2], idxw[2], c, in_hw, inw);
                 buffer += ELEM_SIZE;
-                trans_inp_1tile(( float* )input, buffer, idxh[3], idxw[3], c, in_hw, inw);
+                trans_inp_1tile((float*)input, buffer, idxh[3], idxw[3], c, in_hw, inw);
                 buffer += ELEM_SIZE;
             }
             // interleave
@@ -673,7 +672,7 @@ static inline void tran_input_resi_block(const float* input, float* trans_inp, i
         {
             int ih = ib / block_w;
             int jw = ib % block_w;
-            trans_inp_1tile(( float* )input, buffer, ih, jw, c, in_hw, inw);
+            trans_inp_1tile((float*)input, buffer, ih, jw, c, in_hw, inw);
             buffer += ELEM_SIZE;
         }
         // interleave
@@ -900,8 +899,7 @@ static inline void transform_output_f43_1tile(const float* buffer_ptr, float* ou
         float* out_ptr = out + cout_idx * out_hw;
         int i_h = idx_blockhw / block_w;
         int j_w = idx_blockhw % block_w;
-        if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-            (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+        if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
         {
             trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation);
         }
@@ -955,17 +953,16 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int
         float* out_ptr = out + cout_idx * out_hw;
         if (bias)
         {
-            bias_ptr = ( float* )bias + cout_idx;
+            bias_ptr = (float*)bias + cout_idx;
         }
         for (int ii = 0; ii < 4; ii++)
         {
             int i_h = idx_h[ii];
             int j_w = idx_w[ii];
-            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-                (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
             {
                 trans_output_f43(buffer_ptr, out_ptr + (i_h * TILE * outw + j_w * TILE), outw, bias_ptr, activation);
-            }    // direct use_out_ptr
+            } // direct use_out_ptr
             else
             {
                 int ret_h = TILE - resi_h;
@@ -985,7 +982,7 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int
                         out_pointer[hh * outw + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation);
                     }
                 }
-            }    // end else, tmp_buff
+            } // end else, tmp_buff
             buffer_ptr += ELEM_SIZE;
         }
     }
@@ -1048,7 +1045,7 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co
                     float* out_ptr = output + cout_idx * out_hw + idx_h[0] * TILE * out_w + idx_w[0] * TILE;
                     if (bias)
                     {
-                        bias_ptr = ( float* )(bias + cout_idx);
+                        bias_ptr = (float*)(bias + cout_idx);
                     }
 
                     float ker00[4] = {2, 4, 8, 0};
@@ -1086,11 +1083,10 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co
                         {
                             int i_h = idx_h[ii];
                             int j_w = idx_w[ii];
-                            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-                                (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+                            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
                             {
                                 trans_output_f43(buffer + ii * ELEM_SIZE + pss * 36 * 4, out_ptr + (i_h * TILE * out_w + j_w * TILE), out_w, (const float*)bias_ptr, activation);
-                            }    // direct use_out_ptr
+                            } // direct use_out_ptr
                             else
                             {
                                 int ret_h = TILE - resi_h;
@@ -1109,7 +1105,7 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co
                                         out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation);
                                     }
                                 }
-                            }    // end else, tmp_buff
+                            } // end else, tmp_buff
                         }
                     }
                 }
@@ -1163,7 +1159,7 @@ static void wino_sgemm_set(const float* ker, const float* inp, float* output, co
                 }
             }
             // end interleave
-            transform_output_f43_1tile(( const float* )buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h,
+            transform_output_f43_1tile((const float*)buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h,
                                        resi_w, PER_OUT_CHAN, bias, activation);
             // end transform
         }
@@ -1177,7 +1173,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
     int flag_outw = 1;
     if (out_w < 16)
         flag_outw = 0;
-    
+
 #pragma omp parallel for num_threads(num_thread)
     for (int p = (cout_start & -4); p < (cout_end & -4); p += 4)
     {
@@ -1226,7 +1222,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
                     float* out_ptr = output + cout_idx * out_hw + idx_h[0] * TILE * out_w + idx_w[0] * TILE;
                     if (bias)
                     {
-                        bias_ptr = ( float* )(bias + cout_idx);
+                        bias_ptr = (float*)(bias + cout_idx);
                     }
                     float ker00[4] = {2, 4, 8, 0};
 
@@ -1268,13 +1264,12 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
                         {
                             int i_h = idx_h[ii];
                             int j_w = idx_w[ii];
-                            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-                                (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+                            if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
                             {
                                 trans_output_f43(buffer + ii * ELEM_SIZE + pss * 36 * 4,
                                                  out_ptr + (i_h * TILE * out_w + j_w * TILE), out_w,
-                                                 ( const float* )bias_ptr, activation);
-                            }    // direct use_out_ptr
+                                                 (const float*)bias_ptr, activation);
+                            } // direct use_out_ptr
                             else
                             {
                                 int ret_h = TILE - resi_h;
@@ -1285,18 +1280,17 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
                                     ret_w = TILE;
                                 // tmp_buffer
                                 trans_output_f43_ordinary(buffer + ii * ELEM_SIZE + pss * 36 * 4, tmp_buffer,
-                                                          ( const float* )bias_ptr);
+                                                          (const float*)bias_ptr);
                                 float* out_pointer = out_ptr + (i_h * TILE * out_w + j_w * TILE);
                                 for (int hh = 0; hh < ret_h; hh++)
                                 {
                                     for (int ww = 0; ww < ret_w; ww++)
                                     {
-                                        out_pointer[hh * out_w + ww] =
-                                            do_activation(tmp_buffer[hh * 4 + ww], activation);
+                                        out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation);
                                     }
                                 }
 
-                            }    // end else, tmp_buff
+                            } // end else, tmp_buff
                         }
                     }
                 }
@@ -1353,7 +1347,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
                 }
             }
             // end interleave
-            transform_output_f43_1tile(( const float* )buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h,
+            transform_output_f43_1tile((const float*)buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h,
                                        resi_w, 4, bias, activation);
             // end transform
         }
@@ -1384,8 +1378,8 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
             // gemm+interleave buffer[4][36]
             for (int s = 0; s < ELEM_SIZE; s++)
             {
-                float* inp_ = ( float* )(inp_ptr + s * 4 * cin);
-                float* ker_ = ( float* )(ker_ptr + s * cin);
+                float* inp_ = (float*)(inp_ptr + s * 4 * cin);
+                float* ker_ = (float*)(ker_ptr + s * cin);
 
                 float sum0 = 0;
                 float sum1 = 0;
@@ -1415,12 +1409,11 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
             {
                 int i_h = idx_h[ii];
                 int j_w = idx_w[ii];
-                if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) ||
-                    (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
+                if ((resi_h == 0 && resi_w == 0) || (resi_h == 0 && (j_w < block_w - 1)) || (resi_w == 0 && (i_h < block_h - 1)) || ((j_w < block_w - 1) && (i_h < block_h - 1)))
                 {
                     trans_output_f43(buffer + ii * ELEM_SIZE, out_ptr + (i_h * TILE * out_w + j_w * TILE), out_w,
-                                     ( const float* )bias_ptr, activation);
-                }    // direct use_out_ptr
+                                     (const float*)bias_ptr, activation);
+                } // direct use_out_ptr
                 else
                 {
                     int ret_h = TILE - resi_h;
@@ -1430,7 +1423,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
                     if (j_w < block_w - 1)
                         ret_w = TILE;
                     // tmp_buffer
-                    trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, ( const float* )bias_ptr);
+                    trans_output_f43_ordinary(buffer + ii * ELEM_SIZE, tmp_buffer, (const float*)bias_ptr);
                     float* out_pointer = out_ptr + (i_h * TILE * out_w + j_w * TILE);
                     for (int hh = 0; hh < ret_h; hh++)
                     {
@@ -1439,8 +1432,8 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
                             out_pointer[hh * out_w + ww] = do_activation(tmp_buffer[hh * 4 + ww], activation);
                         }
                     }
-                }    // end else, tmp_buff
-            }    // end transform
+                } // end else, tmp_buff
+            }     // end transform
         }
 
         for (; i < block_hw; i++)
@@ -1450,8 +1443,8 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
             float buffer[ELEM_SIZE];
             for (int s = 0; s < ELEM_SIZE; s++)
             {
-                float* inp_ = ( float* )(inp_ptr + s * cin);
-                float* ker_ = ( float* )(ker_ptr + s * cin);
+                float* inp_ = (float*)(inp_ptr + s * cin);
+                float* ker_ = (float*)(ker_ptr + s * cin);
 
                 float sum = 0;
                 for (int k = 0; k < cin; k++)
@@ -1461,7 +1454,7 @@ void wino_sgemm_4x4(const float* ker, const float* inp, float* output, const flo
                 buffer[s] = sum;
             }
             // end interleave
-            transform_output_f43_1tile(( const float* )buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h,
+            transform_output_f43_1tile((const float*)buffer, output, p, i, block_h, block_w, out_hw, out_w, resi_h,
                                        resi_w, 1, bias, activation);
             // end transform
         }
@@ -1473,7 +1466,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
     int output_c = filter->dims[0];
     int input_c = filter->dims[1];
     int trans_ker_size = output_c * input_c * ELEM_SIZE * sizeof(float);
-    return trans_ker_size + 128;    // caution
+    return trans_ker_size + 128; // caution
 }
 
 int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor,
@@ -1482,7 +1475,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
     int output_c = filter_tensor->dims[0];
     int input_c = filter_tensor->dims[1];
     int mem_size = get_private_mem_size(filter_tensor, param);
-    float* trans_mem = ( float* )sys_malloc(mem_size);
+    float* trans_mem = (float*)sys_malloc(mem_size);
 
     if (!priv_info->external_interleave_mem)
     {
@@ -1492,7 +1485,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
     }
 
     transform_kernel_f43_tile(filter_tensor, trans_mem);
-    interleave_kernel(trans_mem, ( float* )priv_info->interleave_buffer, output_c, input_c);
+    interleave_kernel(trans_mem, (float*)priv_info->interleave_buffer, output_c, input_c);
 
     sys_free(trans_mem);
 
@@ -1548,16 +1541,16 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
     int padded_in_hw = padded_in_h * padded_in_w;
 
     /* buffer addr */
-    float* input_buf = ( float* )input_tensor->data;
-    float* output_buf = ( float* )output_tensor->data;
+    float* input_buf = (float*)input_tensor->data;
+    float* output_buf = (float*)output_tensor->data;
     float* biases_buf = NULL;
     if (bias_tensor != NULL)
-        biases_buf = ( float* )bias_tensor->data;
-    float* col_buf = ( float* )priv_info->im2col_buffer;
-    float* interleave_buf = ( float* )priv_info->interleave_buffer;
+        biases_buf = (float*)bias_tensor->data;
+    float* col_buf = (float*)priv_info->im2col_buffer;
+    float* interleave_buf = (float*)priv_info->interleave_buffer;
 
-    float* input_padd_buf = ( float* )sys_malloc(sizeof(float) * padded_in_hw * in_c + 128);
-    float* trans_input_buf = ( float* )sys_malloc(sizeof(float) * block_hw * in_c * ELEM_SIZE + 128);
+    float* input_padd_buf = (float*)sys_malloc(sizeof(float) * padded_in_hw * in_c + 128);
+    float* trans_input_buf = (float*)sys_malloc(sizeof(float) * block_hw * in_c * ELEM_SIZE + 128);
 
     int nn_out_c = out_c / PER_OUT_CHAN * PER_OUT_CHAN;
 
diff --git a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h
index 5f6685528..cac8b75b1 100644
--- a/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h
+++ b/source/device/cpu/op/conv/cortex-a/wino_conv_kernel_arm.h
@@ -30,10 +30,8 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor,
-                         struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param)
-   ;
+                         struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param);
 
 int wino_conv_hcl_postrun(struct conv_priv_info* info);
 
diff --git a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c
index ecbe6c4ca..f9057f0b6 100644
--- a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c
+++ b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c
@@ -35,7 +35,6 @@
 
 #include "arm_math.h"
 
-
 struct cmsis_param
 {
     uint16_t bias_shift;
@@ -78,7 +77,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     int scale = ir_tensor->scale;
     out_shift = cal_shift(scale);
 
-    struct cmsis_param* param = ( struct cmsis_param* )sys_malloc(sizeof(struct cmsis_param));
+    struct cmsis_param* param = (struct cmsis_param*)sys_malloc(sizeof(struct cmsis_param));
 
     param->bias_shift = bias_shift;
     param->out_shift = out_shift;
@@ -86,9 +85,8 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     exec_node->ops_priv = param;
 
     /*2*ch_im_in*dim_kernel*dim_kernel */
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    exec_node->shared_mem_size =
-            sizeof(q15_t) * 2 * conv_param->input_channel * conv_param->kernel_h * conv_param->kernel_w;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    exec_node->shared_mem_size = sizeof(q15_t) * 2 * conv_param->input_channel * conv_param->kernel_h * conv_param->kernel_w;
 
     return 0;
 }
@@ -103,18 +101,18 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct cmsis_param* cmsis_param = ( struct cmsis_param* )exec_node->ops_priv;
+    struct cmsis_param* cmsis_param = (struct cmsis_param*)exec_node->ops_priv;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     int ret = arm_convolve_HWC_q7_nonsquare(
-            input_tensor->data, input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[3], weight_tensor->data,
-            weight_tensor->dims[3], conv_param->kernel_w, conv_param->kernel_h, conv_param->pad_w0, conv_param->pad_h0,
-            conv_param->stride_w, conv_param->stride_h, bias_tensor->data, cmsis_param->bias_shift, cmsis_param->out_shift,
-            output_tensor->data, output_tensor->dims[2], output_tensor->dims[1], exec_graph->shared_mem, NULL);
+        input_tensor->data, input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[3], weight_tensor->data,
+        weight_tensor->dims[3], conv_param->kernel_w, conv_param->kernel_h, conv_param->pad_w0, conv_param->pad_h0,
+        conv_param->stride_w, conv_param->stride_h, bias_tensor->data, cmsis_param->bias_shift, cmsis_param->out_shift,
+        output_tensor->data, output_tensor->dims[2], output_tensor->dims[1], exec_graph->shared_mem, NULL);
 
     if (ret != ARM_MATH_SUCCESS)
     {
@@ -137,12 +135,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops cmsis_node_ops = {.prerun = NULL,
-        .run = run,
-        .reshape = reshape,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score};
+                                         .run = run,
+                                         .reshape = reshape,
+                                         .postrun = NULL,
+                                         .init_node = init_node,
+                                         .release_node = release_node,
+                                         .score = score};
 
 int register_conv_cmsis_op()
 {
diff --git a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c
index 68de53bb4..095dc59f8 100644
--- a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c
+++ b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c
@@ -35,7 +35,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -54,8 +53,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     if (conv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, cpu_affinity) < 0)
     {
@@ -79,7 +78,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     struct node* ir_node = exec_node;
     struct graph* ir_graph = ir_node->graph;
 
@@ -108,8 +107,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     if (input_tensor->data_type != TENGINE_DT_FP32)
         return 0;
 
-    if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 &&
-        ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
+    if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
         return OPS_SCORE_BEST;
     else
         return 0;
diff --git a/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c b/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c
index 9e14cb452..7eafe03af 100644
--- a/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c
+++ b/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.c
@@ -28,19 +28,18 @@
 #include <stdlib.h>
 #include <math.h>
 
-
 #define max(a, b) ((a) > (b) ? (a) : (b))
 #define min(a, b) ((a) < (b) ? (a) : (b))
 
 void relu(float* data, int size, int activation)
 {
-    for(int i = 0; i < size; i++)
+    for (int i = 0; i < size; i++)
     {
-        data[i] = max(data[i], ( float )0);
+        data[i] = max(data[i], (float)0);
 
-        if(activation > 0)
+        if (activation > 0)
         {
-            data[i] = min(data[i], ( float )activation);
+            data[i] = min(data[i], (float)activation);
         }
     }
 }
@@ -58,32 +57,32 @@ void convdw3x3s1(float* output, float* input, float* _kernel, float* _bias, int
     const int group = channel;
     const float* kernel = _kernel;
 
-    #pragma omp parallel for num_threads(num_thread)
-    for (int g=0; g<group; g++)
+#pragma omp parallel for num_threads(num_thread)
+    for (int g = 0; g < group; g++)
     {
         float* out = output + g * c_step_out;
         float* outptr = out;
         float* outptr2 = outptr + outw;
 
         const float bias0 = _bias ? _bias[g] : 0.f;
-        const float* kernel0 = kernel + g*9;
+        const float* kernel0 = kernel + g * 9;
 
         const float* img0 = input + g * c_step_in;
         const float* r0 = img0;
         const float* r1 = img0 + w;
-        const float* r2 = img0 + w*2;
-        const float* r3 = img0 + w*3;
+        const float* r2 = img0 + w * 2;
+        const float* r3 = img0 + w * 3;
 
         const float* k0 = kernel0;
         const float* k1 = kernel0 + 3;
         const float* k2 = kernel0 + 6;
 
         int i = 0;
-        for (; i+1 < outh; i+=2)
+        for (; i + 1 < outh; i += 2)
         {
             int remain = outw;
 
-            for (; remain>0; remain--)
+            for (; remain > 0; remain--)
             {
                 float sum = bias0;
                 sum += r0[0] * k0[0];
@@ -131,7 +130,7 @@ void convdw3x3s1(float* output, float* input, float* _kernel, float* _bias, int
         {
             int remain = outw;
 
-            for (; remain>0; remain--)
+            for (; remain > 0; remain--)
             {
                 float sum = bias0;
                 sum += r0[0] * k0[0];
@@ -171,22 +170,22 @@ void convdw3x3s2(float* output, float* input, float* _kernel, float* _bias, int
 
     const int group = channel;
 
-    const int tailstep = w - 2*outw + w;
+    const int tailstep = w - 2 * outw + w;
     const float* kernel = _kernel;
 
-    #pragma omp parallel for num_threads(num_thread)
-    for (int g=0; g<group; g++)
+#pragma omp parallel for num_threads(num_thread)
+    for (int g = 0; g < group; g++)
     {
         float* out = output + g * c_step_out;
         float* outptr = out;
 
-        const float* kernel0 = kernel + g*9;
+        const float* kernel0 = kernel + g * 9;
         const float bias0 = _bias ? _bias[g] : 0.f;
 
         const float* img0 = input + g * c_step_in;
         const float* r0 = img0;
         const float* r1 = img0 + w;
-        const float* r2 = img0 + w*2;
+        const float* r2 = img0 + w * 2;
 
         const float* k0 = kernel0;
         const float* k1 = kernel0 + 3;
@@ -196,7 +195,7 @@ void convdw3x3s2(float* output, float* input, float* _kernel, float* _bias, int
         for (; i < outh; i++)
         {
             int remain = outw;
-            for (; remain>0; remain--)
+            for (; remain > 0; remain--)
             {
                 float sum = bias0;
                 sum += r0[0] * k0[0];
@@ -221,7 +220,7 @@ void convdw3x3s2(float* output, float* input, float* _kernel, float* _bias, int
             r1 += tailstep;
             r2 += tailstep;
         }
-    }    
+    }
 }
 
 void pad(float* input, float* output, int in_h, int in_w, int out_h, int out_w, int top, int left, float v)
@@ -282,9 +281,9 @@ void pad(float* input, float* output, int in_h, int in_w, int out_h, int out_w,
 int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor,
                 struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity)
 {
-    float* input  = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
-    float* kernel = ( float* )weight_tensor->data;
+    float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
+    float* kernel = (float*)weight_tensor->data;
     float* biases = NULL;
     if (bias_tensor)
         biases = (float*)bias_tensor->data;
@@ -298,7 +297,7 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc
     int outc = output_tensor->dims[1];
     int outh = output_tensor->dims[2];
     int outw = output_tensor->dims[3];
-    int out_hw  = outh * outw;
+    int out_hw = outh * outw;
     int out_chw = out_hw * outc;
 
     int ksize_h = param->kernel_h;
@@ -323,16 +322,16 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc
     else
     {
         input_tmp = (float*)malloc(inh_tmp * inw_tmp * group * sizeof(float));
-        for (int g=0; g<group; g++)
+        for (int g = 0; g < group; g++)
         {
-            float* pad_in  = input + g * inh * inw;
+            float* pad_in = input + g * inh * inw;
             float* pad_out = input_tmp + g * inh_tmp * inw_tmp;
             pad(pad_in, pad_out, inh, inw, inh_tmp, inw_tmp, pad_h, pad_w, 0.f);
         }
     }
 
     /* process */
-    for(int i = 0; i < batch_number; i++)
+    for (int i = 0; i < batch_number; i++)
     {
         if (stride_h == 1)
             convdw3x3s1(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread);
diff --git a/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.h b/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.h
index 25beb2930..ebea043b9 100644
--- a/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.h
+++ b/source/device/cpu/op/conv/mips/conv_dw_kernel_mips.h
@@ -30,9 +30,7 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
-                struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity)
-   ;
+                struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity);
 
 #endif
diff --git a/source/device/cpu/op/conv/mips/conv_hcl_mips.c b/source/device/cpu/op/conv/mips/conv_hcl_mips.c
index 581b5841b..baa067b77 100644
--- a/source/device/cpu/op/conv/mips/conv_hcl_mips.c
+++ b/source/device/cpu/op/conv/mips/conv_hcl_mips.c
@@ -37,7 +37,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -46,8 +45,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* get cpu affinity */
     conv_priv_info->cpu_type = exec_graph->cpu_affinity;
@@ -67,7 +66,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
         if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size)
         {
             if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem,
-                                              exec_graph->shared_pack4_mem_size) < 0)
+                                              exec_graph->shared_pack4_mem_size)
+                < 0)
             {
                 TLOG_ERR("hcl conv: set shared pack4 memory failed\n");
                 // set_tengine_errno(EFAULT);
@@ -119,14 +119,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 run */
     if (exec_graph->mode == TENGINE_MODE_FP32)
     {
         if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread,
-                         cpu_affinity) < 0)
+                         cpu_affinity)
+            < 0)
         {
             TLOG_ERR("hcl conv run failed\n");
             // set_tengine_errno(EFAULT);
@@ -149,7 +150,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 postrun */
     if (exec_graph->mode == TENGINE_MODE_FP32)
@@ -182,10 +183,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* init the private info data of convolution op */
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info));
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info));
     if (conv_priv_info == NULL)
     {
         // set_tengine_errno(ENOMEM);
@@ -211,7 +212,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
 
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
     sys_free(conv_priv_info);
     exec_node->ops_priv = NULL;
 
@@ -224,7 +225,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     int group = param->group;
     int kernel_h = param->kernel_h;
     int kernel_w = param->kernel_w;
@@ -246,8 +247,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .postrun = postrun,
                                        .init_node = init_node,
                                        .release_node = release_node,
-                                       .score = score
-};
+                                       .score = score};
 
 int register_conv_hcl_mips_op()
 {
diff --git a/source/device/cpu/op/conv/mips/conv_kernel_mips.c b/source/device/cpu/op/conv/mips/conv_kernel_mips.c
index 9bd9d4b69..b66994ec3 100644
--- a/source/device/cpu/op/conv/mips/conv_kernel_mips.c
+++ b/source/device/cpu/op/conv/mips/conv_kernel_mips.c
@@ -41,7 +41,6 @@
 #include <stdlib.h>
 #include <math.h>
 
-
 #if __mips_msa
 #include <msa.h>
 #endif
@@ -50,7 +49,7 @@
 
 static int get_private_mem_size(struct tensor* filter)
 {
-    return filter->elem_num * filter->elem_size;    // caution
+    return filter->elem_num * filter->elem_size; // caution
 }
 
 static void interleave(struct tensor* filter, struct conv_priv_info* priv_info)
@@ -126,8 +125,8 @@ void input_pack4(int K, int N, float* pB, float* pB_t, int num_thread)
     int nn_size = N >> 2;
     int remian_size_start = nn_size << 2;
 
-    // [ch00, ch10, ch20, ch30, ch01, ch11, ch21, ch31, ch02, ch12, ch22, ch32, ch03, ch13, ch23, ch33 ....]
-    #pragma omp parallel for num_threads(num_thread)
+// [ch00, ch10, ch20, ch30, ch01, ch11, ch21, ch31, ch02, ch12, ch22, ch32, ch03, ch13, ch23, ch33 ....]
+#pragma omp parallel for num_threads(num_thread)
     for (int ii = 0; ii < nn_size; ii++)
     {
         int i = ii * 4;
@@ -143,14 +142,14 @@ void input_pack4(int K, int N, float* pB, float* pB_t, int num_thread)
             tmp[1] = img[1];
             tmp[2] = img[2];
             tmp[3] = img[3];
-#endif    // __mips_msa
+#endif // __mips_msa
             tmp += 4;
             img += N;
         }
     }
 
-    // [ch00, ch01, ch02, ch03 ....]
-    #pragma omp parallel for num_threads(num_thread)
+// [ch00, ch01, ch02, ch03 ....]
+#pragma omp parallel for num_threads(num_thread)
     for (int i = remian_size_start; i < N; i++)
     {
         const float* img = pB + i;
@@ -175,13 +174,13 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
     nn_outch = M >> 2;
     remain_outch_start = nn_outch << 2;
 
-    // output ch0 - ch3
-    #pragma omp parallel for num_threads(num_thread)
-    for (int pp=0; pp<nn_outch; pp++)
+// output ch0 - ch3
+#pragma omp parallel for num_threads(num_thread)
+    for (int pp = 0; pp < nn_outch; pp++)
     {
-        int i =  pp * 4;
+        int i = pp * 4;
 
-        float* output0 = pC + ( i )*N;
+        float* output0 = pC + (i)*N;
         float* output1 = pC + (i + 1) * N;
         float* output2 = pC + (i + 2) * N;
         float* output3 = pC + (i + 3) * N;
@@ -206,10 +205,10 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
                 v4f32 _vb = (v4f32)__msa_ld_w(vb, 0);
                 v4i32 _va0123 = __msa_ld_w(va, 0);
 
-                _sum0 = __msa_fmadd_w(_sum0, _vb, (v4f32)__msa_splati_w(_va0123, 0));    // sum0 = (a00-a03) * k00
-                _sum1 = __msa_fmadd_w(_sum1, _vb, (v4f32)__msa_splati_w(_va0123, 1));    // sum1 = (a00-a03) * k10
-                _sum2 = __msa_fmadd_w(_sum2, _vb, (v4f32)__msa_splati_w(_va0123, 2));    // sum2 = (a00-a03) * k20
-                _sum3 = __msa_fmadd_w(_sum3, _vb, (v4f32)__msa_splati_w(_va0123, 3));    // sum3 = (a00-a03) * k30
+                _sum0 = __msa_fmadd_w(_sum0, _vb, (v4f32)__msa_splati_w(_va0123, 0)); // sum0 = (a00-a03) * k00
+                _sum1 = __msa_fmadd_w(_sum1, _vb, (v4f32)__msa_splati_w(_va0123, 1)); // sum1 = (a00-a03) * k10
+                _sum2 = __msa_fmadd_w(_sum2, _vb, (v4f32)__msa_splati_w(_va0123, 2)); // sum2 = (a00-a03) * k20
+                _sum3 = __msa_fmadd_w(_sum3, _vb, (v4f32)__msa_splati_w(_va0123, 3)); // sum3 = (a00-a03) * k30
 
                 va += 4;
                 vb += 4;
@@ -245,7 +244,7 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
                 output2[n] = sum2[n];
                 output3[n] = sum3[n];
             }
-#endif    // __mips_msa
+#endif // __mips_msa
             output0 += 4;
             output1 += 4;
             output2 += 4;
@@ -275,10 +274,10 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
                 v4f32 _va2 = (v4f32)__msa_ld_w(va + 8, 0);
                 v4f32 _va3 = (v4f32)__msa_ld_w(va + 12, 0);
 
-                _sum0 = __msa_fmadd_w(_sum0, _va0, (v4f32)__msa_splati_w(_vb0123, 0));    // sum0 += (k00-k30) * a00
-                _sum1 = __msa_fmadd_w(_sum1, _va1, (v4f32)__msa_splati_w(_vb0123, 1));    // sum1 += (k01-k31) * a10
-                _sum2 = __msa_fmadd_w(_sum2, _va2, (v4f32)__msa_splati_w(_vb0123, 2));    // sum2 += (k02-k32) * a20
-                _sum3 = __msa_fmadd_w(_sum3, _va3, (v4f32)__msa_splati_w(_vb0123, 3));    // sum3 += (k03-k33) * a30
+                _sum0 = __msa_fmadd_w(_sum0, _va0, (v4f32)__msa_splati_w(_vb0123, 0)); // sum0 += (k00-k30) * a00
+                _sum1 = __msa_fmadd_w(_sum1, _va1, (v4f32)__msa_splati_w(_vb0123, 1)); // sum1 += (k01-k31) * a10
+                _sum2 = __msa_fmadd_w(_sum2, _va2, (v4f32)__msa_splati_w(_vb0123, 2)); // sum2 += (k02-k32) * a20
+                _sum3 = __msa_fmadd_w(_sum3, _va3, (v4f32)__msa_splati_w(_vb0123, 3)); // sum3 += (k03-k33) * a30
 
                 va += 16;
                 vb += 4;
@@ -294,7 +293,7 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
                 v4f32 _vb0 = {vb[0], vb[0], vb[0], vb[0]};
                 v4f32 _va = (v4f32)__msa_ld_w(va, 0);
 
-                _sum0_3 = __msa_fmadd_w(_sum0_3, _va, _vb0);    // sum0 += (k00-k30) * a00
+                _sum0_3 = __msa_fmadd_w(_sum0_3, _va, _vb0); // sum0 += (k00-k30) * a00
 
                 va += 4;
                 vb += 1;
@@ -323,7 +322,7 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
             output1[0] = sum1;
             output2[0] = sum2;
             output3[0] = sum3;
-#endif    // __mips_msa
+#endif // __mips_msa
             output0++;
             output1++;
             output2++;
@@ -331,9 +330,9 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
         }
     }
 
-    // output ch0
-    #pragma omp parallel for num_threads(num_thread)
-    for (int i=remain_outch_start; i<M; i++)
+// output ch0
+#pragma omp parallel for num_threads(num_thread)
+    for (int i = remain_outch_start; i < M; i++)
     {
         float* output = pC + i * N;
 
@@ -357,10 +356,10 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
                 v4f32 _vb2 = (v4f32)__msa_ld_w(vb + 8, 0);
                 v4f32 _vb3 = (v4f32)__msa_ld_w(vb + 12, 0);
 
-                _sum0 = __msa_fmadd_w(_sum0, _vb0, (v4f32)__msa_splati_w(_va0123, 0));    // sum0 = (a00-a03) * k00
-                _sum0 = __msa_fmadd_w(_sum0, _vb1, (v4f32)__msa_splati_w(_va0123, 1));    // sum0 += (a10-a13) * k01
-                _sum0 = __msa_fmadd_w(_sum0, _vb2, (v4f32)__msa_splati_w(_va0123, 2));    // sum0 += (a20-a23) * k02
-                _sum0 = __msa_fmadd_w(_sum0, _vb3, (v4f32)__msa_splati_w(_va0123, 3));    // sum0 += (a30-a33) * k03
+                _sum0 = __msa_fmadd_w(_sum0, _vb0, (v4f32)__msa_splati_w(_va0123, 0)); // sum0 = (a00-a03) * k00
+                _sum0 = __msa_fmadd_w(_sum0, _vb1, (v4f32)__msa_splati_w(_va0123, 1)); // sum0 += (a10-a13) * k01
+                _sum0 = __msa_fmadd_w(_sum0, _vb2, (v4f32)__msa_splati_w(_va0123, 2)); // sum0 += (a20-a23) * k02
+                _sum0 = __msa_fmadd_w(_sum0, _vb3, (v4f32)__msa_splati_w(_va0123, 3)); // sum0 += (a30-a33) * k03
 
                 va += 4;
                 vb += 16;
@@ -372,7 +371,7 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
                 v4f32 _va0 = {va[0]};
                 v4f32 _vb0 = (v4f32)__msa_ld_w(vb, 0);
 
-                _sum0 = __msa_fmadd_w(_sum0, _vb0, _va0);    // sum0 = (a00-a03) * k00
+                _sum0 = __msa_fmadd_w(_sum0, _vb0, _va0); // sum0 = (a00-a03) * k00
 
                 va += 1;
                 vb += 4;
@@ -396,7 +395,7 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
             {
                 output[n] = sum[n];
             }
-#endif    // __mips_msa
+#endif // __mips_msa
             output += 4;
         }
 
@@ -423,7 +422,7 @@ static void sgemm(int M, int N, int K, float* pA_t, float* pB_t, float* pC, int
             float sum0 = _sum0[0] + _sum0[1] + _sum0[2] + _sum0[3];
 #else
             float sum0 = 0.f;
-#endif    // __mips_msa
+#endif // __mips_msa
             for (; k < K; k++)
             {
                 sum0 += va[0] * vb[0];
@@ -449,13 +448,13 @@ static void sgemm_fp32(struct tensor* input, struct tensor* filter, struct tenso
     int out_w = output->dims[3];
     int out_image_size = output->dims[1] * output->dims[2] * output->dims[3];
 
-    float* interleave_fp32 = ( float* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
+    float* interleave_fp32 = (float*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
     float* im2col_pack4_fp32 = priv_info->im2col_buffer_pack4;
-    float* output_fp32 = ( float* )output->data + n * out_image_size + outchan_g * group * out_h * out_w;
+    float* output_fp32 = (float*)output->data + n * out_image_size + outchan_g * group * out_h * out_w;
     float* bias_fp32 = NULL;
 
     if (bias)
-        bias_fp32 = ( float* )bias->data + outchan_g * group;
+        bias_fp32 = (float*)bias->data + outchan_g * group;
 
     float* filter_sgemm = interleave_fp32;
     float* input_sgemm_pack4 = im2col_pack4_fp32;
@@ -525,8 +524,7 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w)
     if (in_h <= 10 && in_w <= 10)
         return 0;
 
-    if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 ||
-        dilation_w != 1 || input_chan < 16 || output_chan < 16)
+    if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 || dilation_w != 1 || input_chan < 16 || output_chan < 16)
         return 0;
 
     return 1;
@@ -560,8 +558,8 @@ int conv_hcl_get_interleave_pack4_size(int M, int K, struct tensor* filter)
 
 void conv_hcl_interleave_pack4(int M, int K, struct conv_priv_info* priv_info)
 {
-    float* pA = ( float* )priv_info->interleave_buffer;
-    float* pA_t = ( float* )priv_info->interleave_buffer_pack4;
+    float* pA = (float*)priv_info->interleave_buffer;
+    float* pA_t = (float*)priv_info->interleave_buffer_pack4;
 
     int nn_outch = M >> 2;
     int remain_outch_start = nn_outch << 2;
@@ -674,8 +672,7 @@ int conv_hcl_postrun(struct conv_priv_info* priv_info)
         return wino_conv_hcl_postrun(priv_info);
     }
 
-    if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem &&
-        priv_info->interleave_buffer != NULL)
+    if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL)
     {
         sys_free(priv_info->interleave_buffer_pack4);
         priv_info->interleave_buffer_pack4 = NULL;
@@ -713,7 +710,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
                                  cpu_affinity);
     }
 
-    for (int i = 0; i < input_tensor->dims[0]; i++)    // batch size
+    for (int i = 0; i < input_tensor->dims[0]; i++) // batch size
     {
         for (int j = 0; j < group; j++)
         {
diff --git a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c
index c3d8ff789..19ede63f5 100644
--- a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c
+++ b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.c
@@ -27,7 +27,7 @@
 
 #include "wino_conv_kernel_mips.h"
 
-#define TILE 4
+#define TILE      4
 #define ELEM_SIZE ((TILE + 2) * (TILE + 2))
 
 #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -37,11 +37,11 @@ static void relu(float* data, int size, int activation)
 {
     for (int i = 0; i < size; i++)
     {
-        data[i] = WINO_MAX(data[i], ( float )0);
+        data[i] = WINO_MAX(data[i], (float)0);
 
         if (activation > 0)
         {
-            data[i] = WINO_MIN(data[i], ( float )activation);
+            data[i] = WINO_MIN(data[i], (float)activation);
         }
     }
 }
@@ -50,7 +50,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
     int output_c = filter->dims[0];
     int input_c = filter->dims[1];
     int trans_ker_size = output_c * input_c * ELEM_SIZE * sizeof(float);
-    return trans_ker_size + 128;    // caution
+    return trans_ker_size + 128; // caution
 }
 
 static void pad_0_align_2D(float* dst, float* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w)
@@ -132,7 +132,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
         int w_tm = outw_align / 4 * 6;
         int h_tm = outh_align / 4 * 6;
 
-        int nColBlocks = h_tm / 6;    // may be the block num in Feathercnn
+        int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
         int nRowBlocks = w_tm / 6;
 
         const int tiles = nColBlocks * nRowBlocks;
@@ -164,7 +164,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
         // 4 =	2 * r01 - r02 - 2 * r03 + r04
         // 5 =	4 * r01 - 5 * r03 + r05
 
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int q = 0; q < inch; q++)
         {
             const float* img = bottom_blob_bordered + q * w * h;
@@ -322,7 +322,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
         int w_tm = outw_align / 4 * 6;
         int h_tm = outh_align / 4 * 6;
 
-        int nColBlocks = h_tm / 6;    // may be the block num in Feathercnn
+        int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
         int nRowBlocks = w_tm / 6;
 
         const int tiles = nColBlocks * nRowBlocks;
@@ -330,7 +330,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
 
         top_blob_tm = dot_block;
 
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int r = 0; r < 9; r++)
         {
             int nn_outch = 0;
@@ -533,7 +533,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                         output6_tm[n] = sum6[n];
                         output7_tm[n] = sum7[n];
                     }
-#endif    // __mips_msa
+#endif // __mips_msa
                     output0_tm += 36;
                     output1_tm += 36;
                     output2_tm += 36;
@@ -617,7 +617,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                         output2_tm[n] = sum2[n];
                         output3_tm[n] = sum3[n];
                     }
-#endif    // __mips_msa
+#endif // __mips_msa
                     output0_tm += 36;
                     output1_tm += 36;
                     output2_tm += 36;
@@ -658,7 +658,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                     {
                         for (int n = 0; n < 4; n++)
                         {
-                            sum0[n] += ( int )r0[n] * kptr[n];
+                            sum0[n] += (int)r0[n] * kptr[n];
                         }
                         kptr += 4;
                         r0 += 4;
@@ -668,7 +668,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                     {
                         output0_tm[n] = sum0[n];
                     }
-#endif    // __mips_msa
+#endif // __mips_msa
                     output0_tm += 36;
                 }
             }
@@ -703,12 +703,12 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
         int w_tm = outw_align / 4 * 6;
         int h_tm = outh_align / 4 * 6;
 
-        int nColBlocks = h_tm / 6;    // may be the block num in Feathercnn
+        int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
         int nRowBlocks = w_tm / 6;
 
         const int tiles = nColBlocks * nRowBlocks;
 
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int p = 0; p < outch; p++)
         {
             float* out_tile = top_blob_tm + 36 * tiles * p;
@@ -816,14 +816,13 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
 
 void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kernel_wino, int inch, int outch)
 {
-    float* kernel_tm = ( float* )sys_malloc(6 * 6 * inch * outch * sizeof(float));
+    float* kernel_tm = (float*)sys_malloc(6 * 6 * inch * outch * sizeof(float));
 
     // G
     const float ktm[6][3] = {
-        {1.0f / 4, 0.0f, 0.0f},           {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6},
-        {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}};
+        {1.0f / 4, 0.0f, 0.0f}, {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6}, {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}};
 
-    #pragma omp parallel for
+#pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -864,14 +863,14 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne
         int p = 0;
         for (; p + 7 < outch; p += 8)
         {
-            const float* kernel0 = ( const float* )kernel_tm + p * inch * 36;
-            const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36;
-            const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36;
-            const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36;
-            const float* kernel4 = ( const float* )kernel_tm + (p + 4) * inch * 36;
-            const float* kernel5 = ( const float* )kernel_tm + (p + 5) * inch * 36;
-            const float* kernel6 = ( const float* )kernel_tm + (p + 6) * inch * 36;
-            const float* kernel7 = ( const float* )kernel_tm + (p + 7) * inch * 36;
+            const float* kernel0 = (const float*)kernel_tm + p * inch * 36;
+            const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36;
+            const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36;
+            const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36;
+            const float* kernel4 = (const float*)kernel_tm + (p + 4) * inch * 36;
+            const float* kernel5 = (const float*)kernel_tm + (p + 5) * inch * 36;
+            const float* kernel6 = (const float*)kernel_tm + (p + 6) * inch * 36;
+            const float* kernel7 = (const float*)kernel_tm + (p + 7) * inch * 36;
 
             float* ktmp = kernel_tm_test + p / 8 * inch * 32;
 
@@ -931,10 +930,10 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne
 
         for (; p + 3 < outch; p += 4)
         {
-            const float* kernel0 = ( const float* )kernel_tm + p * inch * 36;
-            const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36;
-            const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36;
-            const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36;
+            const float* kernel0 = (const float*)kernel_tm + p * inch * 36;
+            const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36;
+            const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36;
+            const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36;
 
             float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4) * inch * 32;
             for (int q = 0; q < inch; q++)
@@ -969,7 +968,7 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne
 
         for (; p < outch; p++)
         {
-            const float* kernel0 = ( const float* )kernel_tm + p * inch * 36;
+            const float* kernel0 = (const float*)kernel_tm + p * inch * 36;
             float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4 + p % 4) * inch * 32;
 
             for (int q = 0; q < inch; q++)
@@ -1003,7 +1002,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
     int pad_h = param->pad_h0;
     int pad_w = param->pad_w0;
 
-    float* kernel = ( float* )filter_tensor->data;
+    float* kernel = (float*)filter_tensor->data;
 
     if (!priv_info->external_interleave_mem)
     {
@@ -1023,17 +1022,17 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
 
     int outw = block_w * TILE;
     int outh = block_h * TILE;
-    priv_info->input_pad = ( float* )sys_malloc(batch * input_c * pad_inhw * sizeof(float));
+    priv_info->input_pad = (float*)sys_malloc(batch * input_c * pad_inhw * sizeof(float));
     memset(priv_info->input_pad, 0, batch * input_c * pad_inhw * sizeof(float));
-    priv_info->dot_block = ( float* )sys_malloc(ELEM_SIZE * block * output_c * sizeof(float));
-    priv_info->transform_input = ( float* )sys_malloc(ELEM_SIZE * block * input_c * sizeof(float));
+    priv_info->dot_block = (float*)sys_malloc(ELEM_SIZE * block * output_c * sizeof(float));
+    priv_info->transform_input = (float*)sys_malloc(ELEM_SIZE * block * input_c * sizeof(float));
     priv_info->output_bordered = NULL;
     if (outw != output_w || outh != output_h)
     {
-        priv_info->output_bordered = ( float* )sys_malloc(outw * outh * output_c * sizeof(float));
+        priv_info->output_bordered = (float*)sys_malloc(outw * outh * output_c * sizeof(float));
     }
 
-    conv3x3s1_winograd43_transform_kernel_sse(kernel, ( float* )priv_info->interleave_buffer, input_c, output_c);
+    conv3x3s1_winograd43_transform_kernel_sse(kernel, (float*)priv_info->interleave_buffer, input_c, output_c);
 
     return 0;
 }
@@ -1111,11 +1110,11 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
     int padded_in_hw = padded_in_h * padded_in_w;
 
     /* buffer addr */
-    float* input = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
+    float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
     float* biases = NULL;
     if (bias_tensor != NULL)
-        biases = ( float* )bias_tensor->data;
+        biases = (float*)bias_tensor->data;
 
     pad_0_align_3D(priv_info->input_pad, input, in_h, in_w, padded_in_h, padded_in_w, in_c, pad_h0, pad_w0);
 
diff --git a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h
index 4a2610126..aee0540a3 100644
--- a/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h
+++ b/source/device/cpu/op/conv/mips/wino_conv_kernel_mips.h
@@ -30,14 +30,12 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 #if __mips_msa
 #include <msa.h>
 #endif
 
 int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor,
-                         struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param)
-   ;
+                         struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param);
 
 int wino_conv_hcl_postrun(struct conv_priv_info* info);
 
diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c
index 7eab21fd0..338827acd 100644
--- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c
+++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c
@@ -39,7 +39,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -54,16 +53,16 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     int ret = -1;
     if (exec_graph->mode == TENGINE_MODE_FP32)
         ret = conv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, cpu_affinity);
     else
     {
-            TLOG_ERR("hcl conv run failed\n");
-            return -1;
+        TLOG_ERR("hcl conv run failed\n");
+        return -1;
     }
 
     return ret;
@@ -81,7 +80,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     struct node* ir_node = exec_node;
     struct graph* ir_graph = ir_node->graph;
 
@@ -113,12 +112,10 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     if (kernel_h != kernel_w || input_tensor->dims[0] > 1)
         return 0;
 
-    if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 &&
-        ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
+    if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
+        return OPS_SCORE_BEST;
+    else if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 5 && kernel_w == 5 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
         return OPS_SCORE_BEST;
-    else if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 5 && kernel_w == 5 &&
-        ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
-        return OPS_SCORE_BEST;        
     else
         return 0;
 }
diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c
index 18e6ef238..a7b45fbc0 100644
--- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c
+++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.c
@@ -54,7 +54,6 @@
 #include <string.h>
 #include <math.h>
 
-
 #define max(a, b) ((a) > (b) ? (a) : (b))
 #define min(a, b) ((a) < (b) ? (a) : (b))
 
@@ -62,11 +61,11 @@ static void relu(float* data, int size, int activation)
 {
     for (int i = 0; i < size; i++)
     {
-        data[i] = max(data[i], ( float )0);
+        data[i] = max(data[i], (float)0);
 
         if (activation > 0)
         {
-            data[i] = min(data[i], ( float )activation);
+            data[i] = min(data[i], (float)activation);
         }
     }
 }
@@ -319,7 +318,7 @@ static void convdw5x5s1(float* output, float* input, float* _kernel, float* _bia
     int c_step_out = outw * outh;
 
     const int group = channel;
-    const float* kernel = _kernel;    
+    const float* kernel = _kernel;
 
 #pragma omp parallel for num_threads(num_thread)
     for (int g = 0; g < group; g++)
@@ -597,12 +596,12 @@ static void convdw5x5s2(float* output, float* input, float* _kernel, float* _bia
 int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor,
                 struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity)
 {
-    float* input = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
-    float* kernel = ( float* )weight_tensor->data;
+    float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
+    float* kernel = (float*)weight_tensor->data;
     float* biases = NULL;
     if (bias_tensor)
-        biases = ( float* )bias_tensor->data;
+        biases = (float*)bias_tensor->data;
 
     int batch_number = input_tensor->dims[0];
     int inc = input_tensor->dims[1];
@@ -637,8 +636,8 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc
         input_tmp = input;
     else
     {
-        input_tmp = ( float* )sys_malloc(inh_tmp * inw_tmp * group * sizeof(float));
-#pragma omp parallel for num_threads(num_thread)        
+        input_tmp = (float*)sys_malloc(inh_tmp * inw_tmp * group * sizeof(float));
+#pragma omp parallel for num_threads(num_thread)
         for (int g = 0; g < group; g++)
         {
             float* pad_in = input + g * inh * inw;
@@ -650,13 +649,13 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc
     /* process */
     for (int i = 0; i < batch_number; i++)
     {
-        if (ksize_h ==3 && stride_h == 1)
+        if (ksize_h == 3 && stride_h == 1)
             convdw3x3s1(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread);
-        else if  (ksize_h ==3 && stride_h == 2)
+        else if (ksize_h == 3 && stride_h == 2)
             convdw3x3s2(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread);
-        else if  (ksize_h ==5 && stride_h == 1)
+        else if (ksize_h == 5 && stride_h == 1)
             convdw5x5s1(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread);
-        else if  (ksize_h ==5 && stride_h == 2)
+        else if (ksize_h == 5 && stride_h == 2)
             convdw5x5s2(output, input_tmp, kernel, biases, group, inh_tmp, inw_tmp, outh, outw, num_thread);
         else
             TLOG_ERR("convdw %d x %d, s %d not support.\n", ksize_h, ksize_w, stride_h);
diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h
index a08006b87..0a6276579 100644
--- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h
+++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_kernel_rv64.h
@@ -31,7 +31,6 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
                 struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity);
 
diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c
index 8cd3bfcf4..ac7333ff0 100644
--- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c
+++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c
@@ -39,7 +39,6 @@
 
 #include "string.h"
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -48,8 +47,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* get cpu affinity */
     conv_priv_info->cpu_type = exec_graph->cpu_affinity;
@@ -68,7 +67,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
         if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size)
         {
             if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem,
-                                              exec_graph->shared_pack4_mem_size) < 0)
+                                              exec_graph->shared_pack4_mem_size)
+                < 0)
             {
                 TLOG_ERR("hcl conv: set shared pack4 memory failed\n");
                 return -1;
@@ -117,14 +117,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 run */
     if (exec_graph->mode == TENGINE_MODE_FP32)
     {
         if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread,
-                         cpu_affinity) < 0)
+                         cpu_affinity)
+            < 0)
         {
             TLOG_ERR("hcl conv run failed\n");
             return -1;
@@ -146,7 +147,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 postrun */
     if (exec_graph->mode == TENGINE_MODE_FP32)
@@ -178,10 +179,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* init the private info data of convolution op */
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info));
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info));
     if (conv_priv_info == NULL)
     {
         return -1;
@@ -206,7 +207,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
 
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
     sys_free(conv_priv_info);
     exec_node->ops_priv = NULL;
 
@@ -219,7 +220,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     int group = param->group;
     int kernel_h = param->kernel_h;
     int kernel_w = param->kernel_w;
@@ -235,16 +236,14 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     return OPS_SCORE_PREFER;
 }
 
-
 static struct node_ops hcl_node_ops = {
-        .prerun = prerun,
-        .run = run,
-        .reshape = reshape,
-        .postrun = postrun,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score
-};
+    .prerun = prerun,
+    .run = run,
+    .reshape = reshape,
+    .postrun = postrun,
+    .init_node = init_node,
+    .release_node = release_node,
+    .score = score};
 
 int register_conv_hcl_rv64_op()
 {
@@ -256,4 +255,3 @@ int unregister_conv_hcl_rv64_op()
     unregister_builtin_node_ops(OP_CONV, &hcl_node_ops);
     return 0;
 }
-
diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c
index 3666bab6e..999a49d4e 100644
--- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c
+++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c
@@ -32,9 +32,9 @@
 
 #define PER_OUT_CHAN 16
 void sgemm_4x16_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy,
-                    int activation, int layout);
+                     int activation, int layout);
 void sgemm_4x4_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy,
-                   int activation, int layout);
+                    int activation, int layout);
 
 void im2col_fp32_1x1(float* input, int input_xy, float* col, int col_cnt, int input_chan);
 void im2col_fp32_3x3(float* input, int w, int h, int channel, float* cur_col, int stride);
@@ -103,9 +103,9 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern
 /* kernel interleave */
 static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param)
 {
-    int group       = param->group;
+    int group = param->group;
     int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3];
-    int out_chan    = filter->dims[0] / group;
+    int out_chan = filter->dims[0] / group;
     int out_chan_align4 = (out_chan + 3) / 4 * 4;
 
     int kernel_size_algin = kernel_size * out_chan_align4;
@@ -115,7 +115,7 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info,
     float* interleave_buf = priv_info->interleave_buffer;
     for (int g = 0; g < group; g++)
     {
-        float* cur_kernel     = kernel + g * kernel_size_group;
+        float* cur_kernel = kernel + g * kernel_size_group;
         float* cur_interleave = interleave_buf + g * kernel_size_algin;
         interleave_kernel(cur_kernel, cur_interleave, out_chan, kernel_size);
     }
@@ -130,14 +130,13 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
         int in_xy = in_w * in_h;
         int out_xy = out_w * out_h;
         int col_end3 = out_xy & 3;
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int col_i = 0; col_i < out_xy - 3; col_i += 4)
         {
             float* cur_col = col + col_i * kernel_size;
 
             float* cur_input = input + col_i;
             im2col_fp32_1x1(cur_input, in_xy, cur_col, 4, in_c);
-
         }
         int col_i = out_xy & -4;
         float* cur_col;
@@ -164,7 +163,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
         int out_xy = out_w * out_h;
         int col_end3 = out_xy & 3;
         int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0);
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int col_i = 0; col_i < (out_xy & -4); col_i += 4)
         {
             float* cur_col = col + col_i * kernel_size;
@@ -176,7 +175,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
             {
                 float* l0 = input + (imy0 * s_h - pad_h0) * in_w + (imx0 * s_w - pad_w0);
                 {
-                    im2col_fp32_3x3(l0, in_w, in_h, in_c, cur_col, s_w);         // add im2col 3x3
+                    im2col_fp32_3x3(l0, in_w, in_h, in_c, cur_col, s_w); // add im2col 3x3
                     cur_col += 4 * kernel_size;
                 }
             }
@@ -239,7 +238,7 @@ static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k
     else
     {
         int out_xy = out_w * out_h;
-        #pragma omp parallel for num_threads(num_thread)
+#pragma omp parallel for num_threads(num_thread)
         for (int col_i = 0; col_i < out_xy - 3; col_i += 4)
         {
             int kernel_size = k_w * k_h * in_c;
@@ -314,20 +313,20 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i
         {
             int p = pp * PER_OUT_CHAN;
 
-            float* biasptr = biases ? ( float* )(biases + p) : NULL;
-            float* kernel_tmp = ( float* )(kernel + p * kernel_size);
-            float* output_tmp = ( float* )(output + p * output_xy);
+            float* biasptr = biases ? (float*)(biases + p) : NULL;
+            float* kernel_tmp = (float*)(kernel + p * kernel_size);
+            float* output_tmp = (float*)(output + p * output_xy);
 
             int col_line = 0;
             for (col_line = 0; col_line + 3 < output_xy; col_line += 4)
             {
-                float* col_tmp = ( float* )(col + col_line * kernel_size);
-                sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0);      // FIXME: replace with sgemm_4x16_rv64
+                float* col_tmp = (float*)(col + col_line * kernel_size);
+                sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64
             }
             {
                 float result[64];
-                float* col_tmp = ( float* )(col + col_line * kernel_size);
-                sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0);         // FIXME: replace with sgemm_4x16_rv64
+                float* col_tmp = (float*)(col + col_line * kernel_size);
+                sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0); // FIXME: replace with sgemm_4x16_rv64
                 for (int i = 0; i < 16; i++)
                 {
                     for (int j = 0; j < (col_end3); j++)
@@ -343,14 +342,14 @@ static void sgemm_set(float* col, float* kernel, float* biases, float* output, i
         {
             int p = pp * PER_OUT_CHAN;
 
-            float* biasptr = biases ? ( float* )(biases + p) : NULL;
-            float* kernel_tmp = ( float* )(kernel + p * kernel_size);
-            float* output_tmp = ( float* )(output + p * output_xy);
+            float* biasptr = biases ? (float*)(biases + p) : NULL;
+            float* kernel_tmp = (float*)(kernel + p * kernel_size);
+            float* output_tmp = (float*)(output + p * output_xy);
 
             for (int col_line = 0; col_line + 3 < output_xy; col_line += 4)
             {
-                float* col_tmp = ( float* )(col + col_line * kernel_size);
-                sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0);          // FIXME: replace with sgemm_4x16_rv64
+                float* col_tmp = (float*)(col + col_line * kernel_size);
+                sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64
             }
         }
     }
@@ -364,23 +363,23 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
     int kernel_end3 = ch_end & 0x3;
 
 #pragma omp parallel for num_threads(num_thread) private(result)
-    for (int kernel_num = ch_start; kernel_num  < ((ch_end & -4)-3); kernel_num += 4)
+    for (int kernel_num = ch_start; kernel_num < ((ch_end & -4) - 3); kernel_num += 4)
     {
         float* cur_biases = NULL;
         float *cur_col, *cur_kernel, *cur_output;
         int col_line;
         if (biases)
-            cur_biases = ( float* )(biases + kernel_num);
-        cur_kernel = ( float* )(kernel + kernel_num * kernel_size);
-        cur_output = ( float* )(output + kernel_num * output_xy);
+            cur_biases = (float*)(biases + kernel_num);
+        cur_kernel = (float*)(kernel + kernel_num * kernel_size);
+        cur_output = (float*)(output + kernel_num * output_xy);
         for (col_line = 0; col_line < (output_xy & -4); col_line += 4)
         {
-            cur_col = ( float* )(col + col_line * kernel_size);
+            cur_col = (float*)(col + col_line * kernel_size);
             sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, cur_output + col_line, output_xy, activation, 0);
         }
         if (col_end3)
         {
-            cur_col = ( float* )(col + col_line * kernel_size);
+            cur_col = (float*)(col + col_line * kernel_size);
             sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
             for (int i = 0; i < 4; i++)
             {
@@ -394,13 +393,13 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
         int kernel_num = (ch_end & -4);
         float* cur_biases = NULL;
         if (biases)
-            cur_biases = ( float* )(biases + kernel_num);
-        float* cur_kernel = ( float* )(kernel + kernel_num * kernel_size);
-        #pragma omp parallel for num_threads(num_thread) private(result)
+            cur_biases = (float*)(biases + kernel_num);
+        float* cur_kernel = (float*)(kernel + kernel_num * kernel_size);
+#pragma omp parallel for num_threads(num_thread) private(result)
         for (int col_line = 0; col_line < (output_xy & -4); col_line += 4)
         {
-            float* cur_col = ( float* )(col + col_line * kernel_size);
-            sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);   
+            float* cur_col = (float*)(col + col_line * kernel_size);
+            sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
             for (int i = 0; i < kernel_end3; i++)
                 for (int j = 0; j < 4; j++)
                     *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j];
@@ -408,8 +407,8 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
         int col_line = output_xy & -4;
         if (col_end3)
         {
-            float* cur_col = ( float* )(col + col_line * kernel_size);
-            sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);   
+            float* cur_col = (float*)(col + col_line * kernel_size);
+            sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
             for (int i = 0; i < (kernel_end3); i++)
             {
                 for (int j = 0; j < (col_end3); j++)
@@ -448,15 +447,15 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w)
  */
 int conv_hcl_get_shared_mem_size_rv64(struct tensor* input, struct tensor* output, struct conv_param* param)
 {
-    int in_h  = input->dims[2];
-    int in_w  = input->dims[3];
+    int in_h = input->dims[2];
+    int in_w = input->dims[3];
     int out_h = output->dims[2];
     int out_w = output->dims[3];
     int group = param->group;
-    int input_chan  = param->input_channel / group;
+    int input_chan = param->input_channel / group;
     int kernel_size = input_chan * param->kernel_h * param->kernel_w;
-    int out_cstep   = out_h * out_w;      // channel cstep, output_h * output_w
-    int elem_size   = input->elem_size;   // uint8/int8 is 1 byte, fp32 is 4 bytes
+    int out_cstep = out_h * out_w;    // channel cstep, output_h * output_w
+    int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes
 
     out_cstep = (out_cstep + 3) / 4 * 4;
     int mem_size = elem_size * kernel_size * out_cstep + 128;
@@ -473,7 +472,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
     int out_chan = filter->dims[0] / group;
     int out_chan_align4 = (out_chan + 3) / 4 * 4;
     int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3];
-    int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128;    // caution
+    int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution
 
     return mem_size;
 }
@@ -523,7 +522,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s
     {
         int mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, param);
         void* mem = sys_malloc(mem_size);
-        priv_info->im2col_buffer      = mem;
+        priv_info->im2col_buffer = mem;
         priv_info->im2col_buffer_size = mem_size;
     }
 
@@ -532,7 +531,7 @@ int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, s
     {
         int mem_size = get_private_mem_size(filter_tensor, param);
         void* mem = sys_malloc(mem_size);
-        priv_info->interleave_buffer      = mem;
+        priv_info->interleave_buffer = mem;
         priv_info->interleave_buffer_size = mem_size;
     }
 
@@ -607,18 +606,18 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
     int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3];
 
     /* buffer addr */
-    float* input_buf = ( float* )input_tensor->data;
-    float* output_buf = ( float* )output_tensor->data;
+    float* input_buf = (float*)input_tensor->data;
+    float* output_buf = (float*)output_tensor->data;
     float* biases_buf = NULL;
     if (bias_tensor != NULL)
-        biases_buf = ( float* )bias_tensor->data;
-    float* col_buf = ( float* )priv_info->im2col_buffer;
-    float* interleave_buf = ( float* )priv_info->interleave_buffer;
+        biases_buf = (float*)bias_tensor->data;
+    float* col_buf = (float*)priv_info->im2col_buffer;
+    float* interleave_buf = (float*)priv_info->interleave_buffer;
 
     int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN;
     int sgemm_set_remain = out_c % PER_OUT_CHAN;
 
-    for (int n = 0; n < batch; n++)    // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
         for (int g = 0; g < group; g++)
         {
diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h
index 9a49bffa1..f2f9051a6 100644
--- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h
+++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h
@@ -49,7 +49,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
                  int num_thread, int cpu_affinity) __attribute__((weak));
 
 int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor,
-                                 struct conv_param* param);
+                                      struct conv_param* param);
 int conv_hcl_get_shared_pack4_mem_size(struct tensor* input_tensor, struct tensor* output_tensor,
                                        struct conv_param* param) __attribute__((weak));
 
diff --git a/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c b/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c
index 7948e34e8..43647e551 100644
--- a/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c
+++ b/source/device/cpu/op/conv/x86/conv_direct_hcl_int8_x86.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_h, int out_w, int top, int left, int8_t v)
 {
     int8_t* ptr = input;
@@ -94,7 +93,7 @@ static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_
 }
 
 static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor,
-                               struct tensor* output_tensor, struct conv_param* param, int num_thread)
+                              struct tensor* output_tensor, struct conv_param* param, int num_thread)
 {
     int inch = input_tensor->dims[1];
     int inh = input_tensor->dims[2];
@@ -115,9 +114,9 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight
     float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float));
 
     int8_t* output_int8 = (int8_t*)output_tensor->data;
-    int8_t* input_int8  = (int8_t*)input_tensor->data;
+    int8_t* input_int8 = (int8_t*)input_tensor->data;
     int32_t* bias_int32 = NULL;
-    if(bias_tensor)
+    if (bias_tensor)
         bias_int32 = (int32_t*)bias_tensor->data;
 
     /* get scale value of quantizaiton */
@@ -135,8 +134,8 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight
         input_tmp = input_int8;
     else
     {
-        input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
-#pragma omp parallel for num_threads(num_thread)        
+        input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
+#pragma omp parallel for num_threads(num_thread)
         for (int g = 0; g < inch; g++)
         {
             int8_t* pad_in = input_int8 + g * inh * inw;
@@ -149,7 +148,7 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight
     for (int p = 0; p < outch; p++)
     {
         int32_t* out0 = output_int32 + p * out_hw;
-        int8_t* kernel0 = (int8_t* )kernel + p * inch * 9;
+        int8_t* kernel0 = (int8_t*)kernel + p * inch * 9;
 
         for (int q = 0; q < inch; q++)
         {
@@ -169,15 +168,15 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight
                 {
                     int sum0 = 0;
 
-                    sum0 += ( int )r0[0] * kernel0[0];
-                    sum0 += ( int )r0[1] * kernel0[1];
-                    sum0 += ( int )r0[2] * kernel0[2];
-                    sum0 += ( int )r1[0] * kernel0[3];
-                    sum0 += ( int )r1[1] * kernel0[4];
-                    sum0 += ( int )r1[2] * kernel0[5];
-                    sum0 += ( int )r2[0] * kernel0[6];
-                    sum0 += ( int )r2[1] * kernel0[7];
-                    sum0 += ( int )r2[2] * kernel0[8];
+                    sum0 += (int)r0[0] * kernel0[0];
+                    sum0 += (int)r0[1] * kernel0[1];
+                    sum0 += (int)r0[2] * kernel0[2];
+                    sum0 += (int)r1[0] * kernel0[3];
+                    sum0 += (int)r1[1] * kernel0[4];
+                    sum0 += (int)r1[2] * kernel0[5];
+                    sum0 += (int)r2[0] * kernel0[6];
+                    sum0 += (int)r2[1] * kernel0[7];
+                    sum0 += (int)r2[2] * kernel0[8];
 
                     *outptr0 += sum0;
 
@@ -204,9 +203,9 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight
         {
             int output_off = i * (outh * outw) + j;
             if (bias_tensor)
-                output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
             else
-                output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i];
         }
     }
 
@@ -252,7 +251,7 @@ static int conv3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight
         {
             int output_off = i * (outh * outw) + j;
 
-            int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale));
+            int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale));
             if (data_i32 > 127)
                 data_i32 = 127;
             else if (data_i32 < -127)
@@ -292,9 +291,9 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight
     float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float));
 
     int8_t* output_int8 = (int8_t*)output_tensor->data;
-    int8_t* input_int8  = (int8_t*)input_tensor->data;
+    int8_t* input_int8 = (int8_t*)input_tensor->data;
     int32_t* bias_int32 = NULL;
-    if(bias_tensor)
+    if (bias_tensor)
         bias_int32 = (int32_t*)bias_tensor->data;
 
     /* get scale value of quantizaiton */
@@ -312,8 +311,8 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight
         input_tmp = input_int8;
     else
     {
-        input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
-#pragma omp parallel for num_threads(num_thread)        
+        input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
+#pragma omp parallel for num_threads(num_thread)
         for (int g = 0; g < inch; g++)
         {
             int8_t* pad_in = input_int8 + g * inh * inw;
@@ -328,7 +327,7 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight
     for (int p = 0; p < outch; p++)
     {
         int32_t* out0 = output_int32 + p * out_hw;
-        int8_t* kernel0 = (int8_t* )kernel + p * inch * 9;
+        int8_t* kernel0 = (int8_t*)kernel + p * inch * 9;
 
         for (int q = 0; q < inch; q++)
         {
@@ -348,15 +347,15 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight
                 {
                     int sum0 = 0;
 
-                    sum0 += ( int )r0[0] * kernel0[0];
-                    sum0 += ( int )r0[1] * kernel0[1];
-                    sum0 += ( int )r0[2] * kernel0[2];
-                    sum0 += ( int )r1[0] * kernel0[3];
-                    sum0 += ( int )r1[1] * kernel0[4];
-                    sum0 += ( int )r1[2] * kernel0[5];
-                    sum0 += ( int )r2[0] * kernel0[6];
-                    sum0 += ( int )r2[1] * kernel0[7];
-                    sum0 += ( int )r2[2] * kernel0[8];
+                    sum0 += (int)r0[0] * kernel0[0];
+                    sum0 += (int)r0[1] * kernel0[1];
+                    sum0 += (int)r0[2] * kernel0[2];
+                    sum0 += (int)r1[0] * kernel0[3];
+                    sum0 += (int)r1[1] * kernel0[4];
+                    sum0 += (int)r1[2] * kernel0[5];
+                    sum0 += (int)r2[0] * kernel0[6];
+                    sum0 += (int)r2[1] * kernel0[7];
+                    sum0 += (int)r2[2] * kernel0[8];
 
                     *outptr0 += sum0;
 
@@ -383,9 +382,9 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight
         {
             int output_off = i * (outh * outw) + j;
             if (bias_tensor)
-                output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
             else
-                output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i];
         }
     }
 
@@ -431,7 +430,7 @@ static int conv3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight
         {
             int output_off = i * (outh * outw) + j;
 
-            int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale));
+            int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale));
             if (data_i32 > 127)
                 data_i32 = 127;
             else if (data_i32 < -127)
@@ -466,19 +465,19 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     int ret = -1;
-    switch(conv_param->stride_h)
+    switch (conv_param->stride_h)
     {
-        case 1:
-            ret = conv3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread);
-            break;
-        case 2:
-            ret = conv3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread);
-            break;
-        default:
-            TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", conv_param->stride_h);
+    case 1:
+        ret = conv3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread);
+        break;
+    case 2:
+        ret = conv3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread);
+        break;
+    default:
+        TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", conv_param->stride_h);
     }
 
     return ret;
@@ -496,7 +495,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     struct node* ir_node = exec_node;
     struct graph* ir_graph = ir_node->graph;
 
@@ -520,8 +519,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     if (input_tensor->data_type != TENGINE_DT_INT8)
         return 0;
 
-    if (group == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 &&
-        ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
+    if (group == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
         return OPS_SCORE_BEST * 2;
     else
         return 0;
diff --git a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c
index 6dfdb8fd1..b94bcb363 100644
--- a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c
+++ b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c
@@ -39,7 +39,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_h, int out_w, int top, int left, int8_t v)
 {
     int8_t* ptr = input;
@@ -96,7 +95,7 @@ static void pad_int8(int8_t* input, int8_t* output, int in_h, int in_w, int out_
 }
 
 static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor,
-                               struct tensor* output_tensor, struct conv_param* param, int num_thread)
+                                struct tensor* output_tensor, struct conv_param* param, int num_thread)
 {
     int inch = input_tensor->dims[1];
     int inh = input_tensor->dims[2];
@@ -117,9 +116,9 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig
     float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float));
 
     int8_t* output_int8 = (int8_t*)output_tensor->data;
-    int8_t* input_int8  = (int8_t*)input_tensor->data;
+    int8_t* input_int8 = (int8_t*)input_tensor->data;
     int32_t* bias_int32 = NULL;
-    if(bias_tensor)
+    if (bias_tensor)
         bias_int32 = (int32_t*)bias_tensor->data;
 
     /* get scale value of quantizaiton */
@@ -137,7 +136,7 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig
         input_tmp = input_int8;
     else
     {
-        input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
+        input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
 #pragma omp parallel for num_threads(num_thread)
         for (int g = 0; g < inch; g++)
         {
@@ -151,7 +150,7 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig
     for (int p = 0; p < outch; p++)
     {
         int32_t* out0 = output_int32 + p * out_hw;
-        int8_t* kernel0 = (int8_t* )kernel + p * 9;
+        int8_t* kernel0 = (int8_t*)kernel + p * 9;
 
         int* outptr0 = out0;
 
@@ -169,15 +168,15 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig
             {
                 int sum0 = 0;
 
-                sum0 += ( int )r0[0] * kernel0[0];
-                sum0 += ( int )r0[1] * kernel0[1];
-                sum0 += ( int )r0[2] * kernel0[2];
-                sum0 += ( int )r1[0] * kernel0[3];
-                sum0 += ( int )r1[1] * kernel0[4];
-                sum0 += ( int )r1[2] * kernel0[5];
-                sum0 += ( int )r2[0] * kernel0[6];
-                sum0 += ( int )r2[1] * kernel0[7];
-                sum0 += ( int )r2[2] * kernel0[8];
+                sum0 += (int)r0[0] * kernel0[0];
+                sum0 += (int)r0[1] * kernel0[1];
+                sum0 += (int)r0[2] * kernel0[2];
+                sum0 += (int)r1[0] * kernel0[3];
+                sum0 += (int)r1[1] * kernel0[4];
+                sum0 += (int)r1[2] * kernel0[5];
+                sum0 += (int)r2[0] * kernel0[6];
+                sum0 += (int)r2[1] * kernel0[7];
+                sum0 += (int)r2[2] * kernel0[8];
 
                 *outptr0 += sum0;
 
@@ -203,9 +202,9 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig
         {
             int output_off = i * (outh * outw) + j;
             if (bias_tensor)
-                output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
             else
-                output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i];
         }
     }
 
@@ -251,7 +250,7 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig
         {
             int output_off = i * (outh * outw) + j;
 
-            int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale));
+            int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale));
             if (data_i32 > 127)
                 data_i32 = 127;
             else if (data_i32 < -127)
@@ -269,9 +268,8 @@ static int convdw3x3s1_int8_sse(struct tensor* input_tensor, struct tensor* weig
     return 0;
 }
 
-
 static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor,
-                              struct tensor* output_tensor, struct conv_param* param, int num_thread)
+                                struct tensor* output_tensor, struct conv_param* param, int num_thread)
 {
     int inch = input_tensor->dims[1];
     int inh = input_tensor->dims[2];
@@ -292,9 +290,9 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig
     float* output_fp32 = (float*)sys_malloc(out_size * sizeof(float));
 
     int8_t* output_int8 = (int8_t*)output_tensor->data;
-    int8_t* input_int8  = (int8_t*)input_tensor->data;
+    int8_t* input_int8 = (int8_t*)input_tensor->data;
     int32_t* bias_int32 = NULL;
-    if(bias_tensor)
+    if (bias_tensor)
         bias_int32 = (int32_t*)bias_tensor->data;
 
     /* get scale value of quantizaiton */
@@ -312,8 +310,8 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig
         input_tmp = input_int8;
     else
     {
-        input_tmp = ( int8_t* )sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
-#pragma omp parallel for num_threads(num_thread)        
+        input_tmp = (int8_t*)sys_malloc((size_t)inh_tmp * inw_tmp * inch * sizeof(int8_t));
+#pragma omp parallel for num_threads(num_thread)
         for (int g = 0; g < inch; g++)
         {
             int8_t* pad_in = input_int8 + g * inh * inw;
@@ -328,7 +326,7 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig
     for (int p = 0; p < outch; p++)
     {
         int32_t* out0 = output_int32 + p * out_hw;
-        int8_t* kernel0 = (int8_t* )kernel + p * 9;
+        int8_t* kernel0 = (int8_t*)kernel + p * 9;
 
         int* outptr0 = out0;
 
@@ -346,15 +344,15 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig
             {
                 int sum0 = 0;
 
-                sum0 += ( int )r0[0] * kernel0[0];
-                sum0 += ( int )r0[1] * kernel0[1];
-                sum0 += ( int )r0[2] * kernel0[2];
-                sum0 += ( int )r1[0] * kernel0[3];
-                sum0 += ( int )r1[1] * kernel0[4];
-                sum0 += ( int )r1[2] * kernel0[5];
-                sum0 += ( int )r2[0] * kernel0[6];
-                sum0 += ( int )r2[1] * kernel0[7];
-                sum0 += ( int )r2[2] * kernel0[8];
+                sum0 += (int)r0[0] * kernel0[0];
+                sum0 += (int)r0[1] * kernel0[1];
+                sum0 += (int)r0[2] * kernel0[2];
+                sum0 += (int)r1[0] * kernel0[3];
+                sum0 += (int)r1[1] * kernel0[4];
+                sum0 += (int)r1[2] * kernel0[5];
+                sum0 += (int)r2[0] * kernel0[6];
+                sum0 += (int)r2[1] * kernel0[7];
+                sum0 += (int)r2[2] * kernel0[8];
 
                 *outptr0 += sum0;
 
@@ -380,9 +378,9 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig
         {
             int output_off = i * (outh * outw) + j;
             if (bias_tensor)
-                output_fp32[output_off] = (float )(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)(output_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
             else
-                output_fp32[output_off] = (float )output_int32[output_off] * input_scale * kernel_scales[i];
+                output_fp32[output_off] = (float)output_int32[output_off] * input_scale * kernel_scales[i];
         }
     }
 
@@ -428,7 +426,7 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig
         {
             int output_off = i * (outh * outw) + j;
 
-            int32_t data_i32 = ( int32_t )(round(output_fp32[output_off] / output_scale));
+            int32_t data_i32 = (int32_t)(round(output_fp32[output_off] / output_scale));
             if (data_i32 > 127)
                 data_i32 = 127;
             else if (data_i32 < -127)
@@ -447,19 +445,19 @@ static int convdw3x3s2_int8_sse(struct tensor* input_tensor, struct tensor* weig
 }
 
 static int conv_dw_run_int8(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor,
-                               struct tensor* output_tensor, struct conv_param* param, int num_thread)
+                            struct tensor* output_tensor, struct conv_param* param, int num_thread)
 {
     int ret = -1;
-    switch(param->stride_h)
+    switch (param->stride_h)
     {
-        case 1:
-            ret = convdw3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread);
-            break;
-        case 2:
-            ret = convdw3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread);
-            break;
-        default:
-            TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", param->stride_h);
+    case 1:
+        ret = convdw3x3s1_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread);
+        break;
+    case 2:
+        ret = convdw3x3s2_int8_sse(input_tensor, weight_tensor, bias_tensor, output_tensor, param, num_thread);
+        break;
+    default:
+        TLOG_ERR("Direct Convolution Int8 not support the stride %d\n", param->stride_h);
     }
 
     return ret;
@@ -480,8 +478,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     int ret = -1;
     if (exec_graph->mode == TENGINE_MODE_FP32)
@@ -490,8 +488,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         ret = conv_dw_run_int8(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_param, num_thread);
     else
     {
-            TLOG_ERR("hcl conv run failed\n");
-            return -1;
+        TLOG_ERR("hcl conv run failed\n");
+        return -1;
     }
 
     return ret;
@@ -509,7 +507,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     struct node* ir_node = exec_node;
     struct graph* ir_graph = ir_node->graph;
 
@@ -538,8 +536,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     if (kernel_h != kernel_w || input_tensor->dims[0] > 1)
         return 0;
 
-    if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 &&
-        ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
+    if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2)))
         return OPS_SCORE_BEST;
     else
         return 0;
diff --git a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c
index 8da88c902..45ec0536c 100644
--- a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c
+++ b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.c
@@ -46,7 +46,6 @@
 #include <immintrin.h>
 #endif
 
-
 #define max(a, b) ((a) > (b) ? (a) : (b))
 #define min(a, b) ((a) < (b) ? (a) : (b))
 
@@ -54,11 +53,11 @@ static void relu(float* data, int size, int activation)
 {
     for (int i = 0; i < size; i++)
     {
-        data[i] = max(data[i], ( float )0);
+        data[i] = max(data[i], (float)0);
 
         if (activation > 0)
         {
-            data[i] = min(data[i], ( float )activation);
+            data[i] = min(data[i], (float)activation);
         }
     }
 }
@@ -127,9 +126,9 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa
     int channel_count = inc >> 3;
     int channel_remain = inc - (channel_count << 3);
     // generate the image tmp
-    float* img_tmp = ( float* )sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float));
-    float* kernel_tmp = ( float* )sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float));
-    float* bias_tmp = ( float* )sys_malloc(8 * (channel_count + 1) * sizeof(float));
+    float* img_tmp = (float*)sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float));
+    float* kernel_tmp = (float*)sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float));
+    float* bias_tmp = (float*)sys_malloc(8 * (channel_count + 1) * sizeof(float));
     {
         for (int i = 0; i < channel_count; i++)
         {
@@ -334,7 +333,7 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa
         }
     }
 
-    float* output_tmp = ( float* )sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float));
+    float* output_tmp = (float*)sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float));
     for (int c = 0; c < channel_count + 1; c++)
     {
         float* ktmp = kernel_tmp + c * 8 * 9;
@@ -783,9 +782,9 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa
     int channel_count = inc >> 3;
     int channel_remain = inc - (channel_count << 3);
     // generate the image tmp
-    float* img_tmp = ( float* )sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float));
-    float* kernel_tmp = ( float* )sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float));
-    float* bias_tmp = ( float* )sys_malloc(8 * (channel_count + 1) * sizeof(float));
+    float* img_tmp = (float*)sys_malloc(8 * (unsigned long)inwh * (channel_count + 1) * sizeof(float));
+    float* kernel_tmp = (float*)sys_malloc(8 * 9 * (channel_count + 1) * sizeof(float));
+    float* bias_tmp = (float*)sys_malloc(8 * (channel_count + 1) * sizeof(float));
     {
         for (int i = 0; i < channel_count; i++)
         {
@@ -993,7 +992,7 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa
         }
     }
 
-    float* output_tmp = ( float* )sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float));
+    float* output_tmp = (float*)sys_malloc((unsigned long)outwh * (channel_count + 1) * 8 * sizeof(float));
     for (int c = 0; c < channel_count + 1; c++)
     {
         float* ktmp = kernel_tmp + c * 8 * 9;
@@ -1310,9 +1309,9 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa
     int channel_remain = inc - (channel_count << 2);
 
     // generate the image tmp
-    float* img_tmp = ( float* )sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float));
-    float* kernel_tmp = ( float* )sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float));
-    float* bias_tmp = ( float* )sys_malloc(4 * (channel_count + 1) * sizeof(float));
+    float* img_tmp = (float*)sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float));
+    float* kernel_tmp = (float*)sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float));
+    float* bias_tmp = (float*)sys_malloc(4 * (channel_count + 1) * sizeof(float));
     {
         for (int i = 0; i < channel_count; i++)
         {
@@ -1416,7 +1415,7 @@ static void convdw3x3s1(float* output, float* img_data, float* kernel_data, floa
             }
         }
     }
-    float* output_tmp = ( float* )sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float));
+    float* output_tmp = (float*)sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float));
 
     for (int c = 0; c < channel_count + 1; c++)
     {
@@ -1951,9 +1950,9 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa
     int channel_count = inc >> 2;
     int channel_remain = inc - (channel_count << 2);
     // generate the image tmp
-    float* img_tmp = ( float* )sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float));
-    float* kernel_tmp = ( float* )sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float));
-    float* bias_tmp = ( float* )sys_malloc(4 * (channel_count + 1) * sizeof(float));
+    float* img_tmp = (float*)sys_malloc(4 * inwh * (channel_count + 1) * sizeof(float));
+    float* kernel_tmp = (float*)sys_malloc(4 * 9 * (channel_count + 1) * sizeof(float));
+    float* bias_tmp = (float*)sys_malloc(4 * (channel_count + 1) * sizeof(float));
     {
         for (int i = 0; i < channel_count; i++)
         {
@@ -2057,7 +2056,7 @@ static void convdw3x3s2(float* output, float* img_data, float* kernel_data, floa
             }
         }
     }
-    float* output_tmp = ( float* )sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float));
+    float* output_tmp = (float*)sys_malloc(outwh * 4 * (channel_count + 1) * sizeof(float));
     for (int c = 0; c < channel_count + 1; c++)
     {
         float* ktmp = kernel_tmp + c * 4 * 9;
@@ -2525,12 +2524,12 @@ static void convdw3x3s2(float* output, float* input, float* _kernel, float* _bia
 int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor,
                 struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity)
 {
-    float* input = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
-    float* kernel = ( float* )weight_tensor->data;
+    float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
+    float* kernel = (float*)weight_tensor->data;
     float* biases = NULL;
     if (bias_tensor)
-        biases = ( float* )bias_tensor->data;
+        biases = (float*)bias_tensor->data;
 
     int batch_number = input_tensor->dims[0];
     int inc = input_tensor->dims[1];
@@ -2565,8 +2564,8 @@ int conv_dw_run(struct tensor* input_tensor, struct tensor* weight_tensor, struc
         input_tmp = input;
     else
     {
-        input_tmp = ( float* )sys_malloc((size_t)inh_tmp * inw_tmp * group * sizeof(float));
-#pragma omp parallel for num_threads(num_thread)        
+        input_tmp = (float*)sys_malloc((size_t)inh_tmp * inw_tmp * group * sizeof(float));
+#pragma omp parallel for num_threads(num_thread)
         for (int g = 0; g < group; g++)
         {
             float* pad_in = input + g * inh * inw;
diff --git a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h
index 665f832f9..85b6ad3ea 100644
--- a/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h
+++ b/source/device/cpu/op/conv/x86/conv_dw_kernel_x86.h
@@ -31,7 +31,6 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int conv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor,
                 struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, int num_thread, int cpu_affinity);
 
diff --git a/source/device/cpu/op/conv/x86/conv_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_hcl_x86.c
index 7215a1bd7..b1a3cf689 100644
--- a/source/device/cpu/op/conv/x86/conv_hcl_x86.c
+++ b/source/device/cpu/op/conv/x86/conv_hcl_x86.c
@@ -38,7 +38,6 @@
 
 #include <string.h>
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -47,8 +46,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* get cpu affinity */
     conv_priv_info->cpu_type = exec_graph->cpu_affinity;
@@ -67,7 +66,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
         if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size)
         {
             if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem,
-                                              exec_graph->shared_pack4_mem_size) < 0)
+                                              exec_graph->shared_pack4_mem_size)
+                < 0)
             {
                 TLOG_ERR("hcl conv: set shared pack4 memory failed\n");
 
@@ -119,14 +119,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 run */
     if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8 || exec_graph->mode == TENGINE_MODE_INT8)
     {
         if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread,
-                         cpu_affinity) < 0)
+                         cpu_affinity)
+            < 0)
         {
             TLOG_ERR("hcl conv run failed\n");
             return -1;
@@ -150,7 +151,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* dynamic get the shape of output tensor */
     int n = input_tensor->dims[0];
@@ -212,10 +213,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     }
     else
     {
-        out_h =
-            (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) /
-            conv_param->stride_h +
-            1;
+        out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1;
     }
 
     if (conv_param->pad_w0 < 0)
@@ -238,10 +236,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     }
     else
     {
-        out_w =
-            (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) /
-            conv_param->stride_w +
-            1;
+        out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1;
     }
 
     int dims[4];
@@ -254,7 +249,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
             dims[2] = out_h;
             dims[3] = out_w;
 
-            for (int i=0; i<4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 if (dims[i] == 0)
                     dims[i] = 1;
@@ -271,7 +266,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
             dims[2] = out_w;
             dims[3] = out_c;
 
-            for (int i=0; i<4; i++)
+            for (int i = 0; i < 4; i++)
             {
                 if (dims[i] == 0)
                     dims[i] = 1;
@@ -286,10 +281,10 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
 
     /* fp32 postrun */
-    if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8 || exec_graph->mode == TENGINE_MODE_INT8 )
+    if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8 || exec_graph->mode == TENGINE_MODE_INT8)
     {
         if (conv_hcl_postrun(conv_priv_info) < 0)
         {
@@ -318,10 +313,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
 
     /* init the private info data of convolution op */
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info));
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info));
     if (conv_priv_info == NULL)
     {
         return -1;
@@ -346,7 +341,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
 
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
+    struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv;
     sys_free(conv_priv_info);
     exec_node->ops_priv = NULL;
 
@@ -359,7 +354,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)exec_node->op.param_mem;
     int group = param->group;
     int kernel_h = param->kernel_h;
     int kernel_w = param->kernel_w;
@@ -381,8 +376,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .postrun = postrun,
                                        .init_node = init_node,
                                        .release_node = release_node,
-                                       .score = score
-};
+                                       .score = score};
 
 int register_conv_hcl_x86_op()
 {
diff --git a/source/device/cpu/op/conv/x86/conv_kernel_x86.c b/source/device/cpu/op/conv/x86/conv_kernel_x86.c
index 763fad86f..ee90eee1c 100644
--- a/source/device/cpu/op/conv/x86/conv_kernel_x86.c
+++ b/source/device/cpu/op/conv/x86/conv_kernel_x86.c
@@ -51,13 +51,12 @@
 #define max(a, b) ((a) > (b) ? (a) : (b))
 #define min(a, b) ((a) < (b) ? (a) : (b))
 
-
 static int get_private_mem_size(struct tensor* filter)
 {
-    if (filter->data_type == TENGINE_DT_UINT8)    // simulator uint8 inference with fp32
+    if (filter->data_type == TENGINE_DT_UINT8) // simulator uint8 inference with fp32
         return filter->elem_num * filter->elem_size * 4;
     else
-        return filter->elem_num * filter->elem_size;    // caution
+        return filter->elem_num * filter->elem_size; // caution
 }
 
 static void interleave(struct tensor* filter, struct conv_priv_info* priv_info)
@@ -69,7 +68,7 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info)
 static void interleave_uint8(struct tensor* filter, struct conv_priv_info* priv_info)
 {
     /* dequant uint8 weight to fp32 for simulator */
-    float* weight_fp32 = (float* )priv_info->interleave_buffer;
+    float* weight_fp32 = (float*)priv_info->interleave_buffer;
     uint8_t* weight_uint8 = (uint8_t*)filter->data;
     float scale = filter->scale;
     int zero_point = filter->zero_point;
@@ -81,7 +80,7 @@ static void interleave_uint8(struct tensor* filter, struct conv_priv_info* priv_
 }
 
 void im2col_fp32(float* data_img, float* data_col, int inh, int inw, int inc, int outh, int outw, int ksize_h,
-            int ksize_w, int sh, int sw, int ph, int pw, int dh, int dw)
+                 int ksize_w, int sh, int sw, int ph, int pw, int dh, int dw)
 {
     const int channels_col = ksize_h * ksize_w * inc;
 
@@ -163,7 +162,7 @@ void im2col_uint8(uint8_t* data_img, float* data_col, struct tensor* input_tenso
 
             if (im_row >= 0 && im_row < inh)
             {
-                uint8_t * in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw;
+                uint8_t* in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw;
 
                 memset(out, 0, w_low * sizeof(float));
                 out += w_low;
@@ -218,12 +217,12 @@ void im2col_int8(int8_t* data_img, int8_t* data_col, struct tensor* input_tensor
         for (int h = 0; h < outh; ++h)
         {
             const int im_row = kh * dh + h * sh - ph;
-            int8_t * out = data_col + (c * outh + h) * outw;
-            const int8_t * end = out + w_high;
+            int8_t* out = data_col + (c * outh + h) * outw;
+            const int8_t* end = out + w_high;
 
             if (im_row >= 0 && im_row < inh)
             {
-                int8_t * in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw;
+                int8_t* in = data_img + inw * (im_row + inh * c_) + im_col + (w_low - 1) * sw;
                 memset(out, 0, w_low * sizeof(int8_t));
                 out += w_low;
                 while (out < end)
@@ -249,8 +248,8 @@ static void im2col_ir(struct tensor* input, struct tensor* output, struct conv_p
     int image_size = input->dims[1] * input->dims[2] * input->dims[3];
     int group_size = input_chan * input->dims[2] * input->dims[3];
 
-	void* input_base = (void*)((uint8_t*)input->data + (n * image_size + group * group_size) * input->elem_size);
-	void* im2col_buf = (void*)priv_info->im2col_buffer;
+    void* input_base = (void*)((uint8_t*)input->data + (n * image_size + group * group_size) * input->elem_size);
+    void* im2col_buf = (void*)priv_info->im2col_buffer;
 
     if (input->data_type == TENGINE_DT_FP32)
     {
@@ -297,7 +296,7 @@ void input_pack4_fp32(int K, int N, float* pB, float* pB_t, int num_thread)
             tmp[5] = img[5];
             tmp[6] = img[6];
             tmp[7] = img[7];
-#endif    // __SSE__
+#endif // __SSE__
             tmp += 8;
             img += N;
         }
@@ -333,7 +332,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
     {
         int i = pp * 8;
 
-        float* output0 = pC + ( i )*N;
+        float* output0 = pC + (i)*N;
         float* output1 = pC + (i + 1) * N;
         float* output2 = pC + (i + 2) * N;
         float* output3 = pC + (i + 3) * N;
@@ -369,18 +368,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _vb1 = _mm256_loadu_ps(vb + 8);
                 __m256 _vb2 = _mm256_loadu_ps(vb + 16);
                 __m256 _vb3 = _mm256_loadu_ps(vb + 24);
-                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0);    // sum0 = (a00-a07) * k00
-                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1);    // sum1 = (a00-a07) * k10
-                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2);    // sum2 = (a00-a07) * k20
-                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3);    // sum3 = (a00-a07) * k30
+                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00
+                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10
+                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20
+                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30
                 _va0 = _mm256_broadcast_ss(va + 4);
                 _va1 = _mm256_broadcast_ss(va + 5);
                 _va2 = _mm256_broadcast_ss(va + 6);
                 _va3 = _mm256_broadcast_ss(va + 7);
-                _sum4 = _mm256_fmadd_ps(_vb0, _va0, _sum4);    // sum4 = (a00-a07) * k40
-                _sum5 = _mm256_fmadd_ps(_vb0, _va1, _sum5);    // sum5 = (a00-a07) * k50
-                _sum6 = _mm256_fmadd_ps(_vb0, _va2, _sum6);    // sum6 = (a00-a07) * k60
-                _sum7 = _mm256_fmadd_ps(_vb0, _va3, _sum7);    // sum7 = (a00-a07) * k70
+                _sum4 = _mm256_fmadd_ps(_vb0, _va0, _sum4); // sum4 = (a00-a07) * k40
+                _sum5 = _mm256_fmadd_ps(_vb0, _va1, _sum5); // sum5 = (a00-a07) * k50
+                _sum6 = _mm256_fmadd_ps(_vb0, _va2, _sum6); // sum6 = (a00-a07) * k60
+                _sum7 = _mm256_fmadd_ps(_vb0, _va3, _sum7); // sum7 = (a00-a07) * k70
 
                 va += 8;
 
@@ -389,18 +388,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 _va1 = _mm256_broadcast_ss(va + 1);
                 _va2 = _mm256_broadcast_ss(va + 2);
                 _va3 = _mm256_broadcast_ss(va + 3);
-                _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0);    // sum0 += (a10-a17) * k01
-                _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1);    // sum1 += (a10-a17) * k11
-                _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2);    // sum2 += (a10-a17) * k21
-                _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3);    // sum3 += (a10-a17) * k31
+                _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0); // sum0 += (a10-a17) * k01
+                _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1); // sum1 += (a10-a17) * k11
+                _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2); // sum2 += (a10-a17) * k21
+                _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3); // sum3 += (a10-a17) * k31
                 _va0 = _mm256_broadcast_ss(va + 4);
                 _va1 = _mm256_broadcast_ss(va + 5);
                 _va2 = _mm256_broadcast_ss(va + 6);
                 _va3 = _mm256_broadcast_ss(va + 7);
-                _sum4 = _mm256_fmadd_ps(_vb1, _va0, _sum4);    // sum4 += (a10-a17) * k41
-                _sum5 = _mm256_fmadd_ps(_vb1, _va1, _sum5);    // sum5 += (a10-a17) * k51
-                _sum6 = _mm256_fmadd_ps(_vb1, _va2, _sum6);    // sum6 += (a10-a17) * k61
-                _sum7 = _mm256_fmadd_ps(_vb1, _va3, _sum7);    // sum7 += (a10-a17) * k71
+                _sum4 = _mm256_fmadd_ps(_vb1, _va0, _sum4); // sum4 += (a10-a17) * k41
+                _sum5 = _mm256_fmadd_ps(_vb1, _va1, _sum5); // sum5 += (a10-a17) * k51
+                _sum6 = _mm256_fmadd_ps(_vb1, _va2, _sum6); // sum6 += (a10-a17) * k61
+                _sum7 = _mm256_fmadd_ps(_vb1, _va3, _sum7); // sum7 += (a10-a17) * k71
 
                 va += 8;
 
@@ -409,18 +408,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 _va1 = _mm256_broadcast_ss(va + 1);
                 _va2 = _mm256_broadcast_ss(va + 2);
                 _va3 = _mm256_broadcast_ss(va + 3);
-                _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0);    // sum0 += (a20-a27) * k02
-                _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1);    // sum1 += (a20-a27) * k12
-                _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2);    // sum2 += (a20-a27) * k22
-                _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3);    // sum3 += (a20-a27) * k32
+                _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0); // sum0 += (a20-a27) * k02
+                _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1); // sum1 += (a20-a27) * k12
+                _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2); // sum2 += (a20-a27) * k22
+                _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3); // sum3 += (a20-a27) * k32
                 _va0 = _mm256_broadcast_ss(va + 4);
                 _va1 = _mm256_broadcast_ss(va + 5);
                 _va2 = _mm256_broadcast_ss(va + 6);
                 _va3 = _mm256_broadcast_ss(va + 7);
-                _sum4 = _mm256_fmadd_ps(_vb2, _va0, _sum4);    // sum4 += (a20-a27) * k42
-                _sum5 = _mm256_fmadd_ps(_vb2, _va1, _sum5);    // sum5 += (a20-a27) * k52
-                _sum6 = _mm256_fmadd_ps(_vb2, _va2, _sum6);    // sum6 += (a20-a27) * k62
-                _sum7 = _mm256_fmadd_ps(_vb2, _va3, _sum7);    // sum7 += (a20-a27) * k72
+                _sum4 = _mm256_fmadd_ps(_vb2, _va0, _sum4); // sum4 += (a20-a27) * k42
+                _sum5 = _mm256_fmadd_ps(_vb2, _va1, _sum5); // sum5 += (a20-a27) * k52
+                _sum6 = _mm256_fmadd_ps(_vb2, _va2, _sum6); // sum6 += (a20-a27) * k62
+                _sum7 = _mm256_fmadd_ps(_vb2, _va3, _sum7); // sum7 += (a20-a27) * k72
 
                 va += 8;
 
@@ -429,18 +428,18 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 _va1 = _mm256_broadcast_ss(va + 1);
                 _va2 = _mm256_broadcast_ss(va + 2);
                 _va3 = _mm256_broadcast_ss(va + 3);
-                _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0);    // sum0 += (a30-a37) * k03
-                _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1);    // sum1 += (a30-a37) * k13
-                _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2);    // sum2 += (a30-a37) * k23
-                _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3);    // sum3 += (a30-a37) * k33
+                _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0); // sum0 += (a30-a37) * k03
+                _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1); // sum1 += (a30-a37) * k13
+                _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2); // sum2 += (a30-a37) * k23
+                _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3); // sum3 += (a30-a37) * k33
                 _va0 = _mm256_broadcast_ss(va + 4);
                 _va1 = _mm256_broadcast_ss(va + 5);
                 _va2 = _mm256_broadcast_ss(va + 6);
                 _va3 = _mm256_broadcast_ss(va + 7);
-                _sum4 = _mm256_fmadd_ps(_vb3, _va0, _sum4);    // sum4 += (a30-a37) * k43
-                _sum5 = _mm256_fmadd_ps(_vb3, _va1, _sum5);    // sum5 += (a30-a37) * k53
-                _sum6 = _mm256_fmadd_ps(_vb3, _va2, _sum6);    // sum6 += (a30-a37) * k63
-                _sum7 = _mm256_fmadd_ps(_vb3, _va3, _sum7);    // sum7 += (a30-a37) * k73
+                _sum4 = _mm256_fmadd_ps(_vb3, _va0, _sum4); // sum4 += (a30-a37) * k43
+                _sum5 = _mm256_fmadd_ps(_vb3, _va1, _sum5); // sum5 += (a30-a37) * k53
+                _sum6 = _mm256_fmadd_ps(_vb3, _va2, _sum6); // sum6 += (a30-a37) * k63
+                _sum7 = _mm256_fmadd_ps(_vb3, _va3, _sum7); // sum7 += (a30-a37) * k73
 
                 va += 8;
                 vb += 32;
@@ -458,14 +457,14 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _va6 = _mm256_broadcast_ss(va + 6);
                 __m256 _va7 = _mm256_broadcast_ss(va + 7);
                 __m256 _vb0 = _mm256_loadu_ps(vb);
-                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0);    // sum0 = (a00-a07) * k00
-                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1);    // sum1 = (a00-a07) * k10
-                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2);    // sum2 = (a00-a07) * k20
-                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3);    // sum3 = (a00-a07) * k30
-                _sum4 = _mm256_fmadd_ps(_vb0, _va4, _sum4);    // sum4 = (a00-a07) * k40
-                _sum5 = _mm256_fmadd_ps(_vb0, _va5, _sum5);    // sum5 = (a00-a07) * k50
-                _sum6 = _mm256_fmadd_ps(_vb0, _va6, _sum6);    // sum6 = (a00-a07) * k60
-                _sum7 = _mm256_fmadd_ps(_vb0, _va7, _sum7);    // sum7 = (a00-a07) * k70
+                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00
+                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10
+                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20
+                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30
+                _sum4 = _mm256_fmadd_ps(_vb0, _va4, _sum4); // sum4 = (a00-a07) * k40
+                _sum5 = _mm256_fmadd_ps(_vb0, _va5, _sum5); // sum5 = (a00-a07) * k50
+                _sum6 = _mm256_fmadd_ps(_vb0, _va6, _sum6); // sum6 = (a00-a07) * k60
+                _sum7 = _mm256_fmadd_ps(_vb0, _va7, _sum7); // sum7 = (a00-a07) * k70
 
                 va += 8;
                 vb += 8;
@@ -518,7 +517,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 output6[n] = sum6[n];
                 output7[n] = sum7[n];
             }
-#endif    // __AVX__
+#endif // __AVX__
             output0 += 8;
             output1 += 8;
             output2 += 8;
@@ -553,10 +552,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _va2 = _mm256_loadu_ps(va + 16);
                 __m256 _va3 = _mm256_loadu_ps(va + 24);
 
-                _sum0 = _mm256_fmadd_ps(_va0, _vb0, _sum0);    // sum0 += (k00-k70) * a00
-                _sum1 = _mm256_fmadd_ps(_va1, _vb1, _sum1);    // sum1 += (k01-k71) * a10
-                _sum2 = _mm256_fmadd_ps(_va2, _vb2, _sum2);    // sum2 += (k02-k72) * a20
-                _sum3 = _mm256_fmadd_ps(_va3, _vb3, _sum3);    // sum3 += (k03-k73) * a30
+                _sum0 = _mm256_fmadd_ps(_va0, _vb0, _sum0); // sum0 += (k00-k70) * a00
+                _sum1 = _mm256_fmadd_ps(_va1, _vb1, _sum1); // sum1 += (k01-k71) * a10
+                _sum2 = _mm256_fmadd_ps(_va2, _vb2, _sum2); // sum2 += (k02-k72) * a20
+                _sum3 = _mm256_fmadd_ps(_va3, _vb3, _sum3); // sum3 += (k03-k73) * a30
 
                 va += 32;
                 vb += 4;
@@ -572,7 +571,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _vb0 = _mm256_broadcast_ss(vb);
                 __m256 _va = _mm256_loadu_ps(va);
 
-                _sum0_7 = _mm256_fmadd_ps(_va, _vb0, _sum0_7);    // sum0 += (k00-k70) * a00
+                _sum0_7 = _mm256_fmadd_ps(_va, _vb0, _sum0_7); // sum0 += (k00-k70) * a00
 
                 va += 8;
                 vb += 1;
@@ -621,7 +620,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
             output5[0] = sum5;
             output6[0] = sum6;
             output7[0] = sum7;
-#endif    // __AVX__
+#endif // __AVX__
             output0++;
             output1++;
             output2++;
@@ -639,7 +638,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
     {
         int i = remain_outch_start + pp * 4;
 
-        float* output0 = pC + ( i )*N;
+        float* output0 = pC + (i)*N;
         float* output1 = pC + (i + 1) * N;
         float* output2 = pC + (i + 2) * N;
         float* output3 = pC + (i + 3) * N;
@@ -667,10 +666,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _vb1 = _mm256_loadu_ps(vb + 8);
                 __m256 _vb2 = _mm256_loadu_ps(vb + 16);
                 __m256 _vb3 = _mm256_loadu_ps(vb + 24);
-                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0);    // sum0 = (a00-a07) * k00
-                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1);    // sum1 = (a00-a07) * k10
-                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2);    // sum2 = (a00-a07) * k20
-                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3);    // sum3 = (a00-a07) * k30
+                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00
+                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10
+                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20
+                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30
 
                 va += 4;
 
@@ -679,10 +678,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 _va1 = _mm256_broadcast_ss(va + 1);
                 _va2 = _mm256_broadcast_ss(va + 2);
                 _va3 = _mm256_broadcast_ss(va + 3);
-                _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0);    // sum0 += (a10-a17) * k01
-                _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1);    // sum1 += (a10-a17) * k11
-                _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2);    // sum2 += (a10-a17) * k21
-                _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3);    // sum3 += (a10-a17) * k31
+                _sum0 = _mm256_fmadd_ps(_vb1, _va0, _sum0); // sum0 += (a10-a17) * k01
+                _sum1 = _mm256_fmadd_ps(_vb1, _va1, _sum1); // sum1 += (a10-a17) * k11
+                _sum2 = _mm256_fmadd_ps(_vb1, _va2, _sum2); // sum2 += (a10-a17) * k21
+                _sum3 = _mm256_fmadd_ps(_vb1, _va3, _sum3); // sum3 += (a10-a17) * k31
 
                 va += 4;
 
@@ -691,10 +690,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 _va1 = _mm256_broadcast_ss(va + 1);
                 _va2 = _mm256_broadcast_ss(va + 2);
                 _va3 = _mm256_broadcast_ss(va + 3);
-                _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0);    // sum0 += (a20-a27) * k02
-                _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1);    // sum1 += (a20-a27) * k12
-                _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2);    // sum2 += (a20-a27) * k22
-                _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3);    // sum3 += (a20-a27) * k32
+                _sum0 = _mm256_fmadd_ps(_vb2, _va0, _sum0); // sum0 += (a20-a27) * k02
+                _sum1 = _mm256_fmadd_ps(_vb2, _va1, _sum1); // sum1 += (a20-a27) * k12
+                _sum2 = _mm256_fmadd_ps(_vb2, _va2, _sum2); // sum2 += (a20-a27) * k22
+                _sum3 = _mm256_fmadd_ps(_vb2, _va3, _sum3); // sum3 += (a20-a27) * k32
 
                 va += 4;
 
@@ -703,10 +702,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 _va1 = _mm256_broadcast_ss(va + 1);
                 _va2 = _mm256_broadcast_ss(va + 2);
                 _va3 = _mm256_broadcast_ss(va + 3);
-                _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0);    // sum0 += (a30-a37) * k03
-                _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1);    // sum1 += (a30-a37) * k13
-                _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2);    // sum2 += (a30-a37) * k23
-                _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3);    // sum3 += (a30-a37) * k33
+                _sum0 = _mm256_fmadd_ps(_vb3, _va0, _sum0); // sum0 += (a30-a37) * k03
+                _sum1 = _mm256_fmadd_ps(_vb3, _va1, _sum1); // sum1 += (a30-a37) * k13
+                _sum2 = _mm256_fmadd_ps(_vb3, _va2, _sum2); // sum2 += (a30-a37) * k23
+                _sum3 = _mm256_fmadd_ps(_vb3, _va3, _sum3); // sum3 += (a30-a37) * k33
 
                 va += 4;
                 vb += 32;
@@ -720,10 +719,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _va2 = _mm256_broadcast_ss(va + 2);
                 __m256 _va3 = _mm256_broadcast_ss(va + 3);
                 __m256 _vb0 = _mm256_loadu_ps(vb);
-                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0);    // sum0 = (a00-a07) * k00
-                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1);    // sum1 = (a00-a07) * k10
-                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2);    // sum2 = (a00-a07) * k20
-                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3);    // sum3 = (a00-a07) * k30
+                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00
+                _sum1 = _mm256_fmadd_ps(_vb0, _va1, _sum1); // sum1 = (a00-a07) * k10
+                _sum2 = _mm256_fmadd_ps(_vb0, _va2, _sum2); // sum2 = (a00-a07) * k20
+                _sum3 = _mm256_fmadd_ps(_vb0, _va3, _sum3); // sum3 = (a00-a07) * k30
 
                 va += 4;
                 vb += 8;
@@ -760,7 +759,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 output2[n] = sum2[n];
                 output3[n] = sum3[n];
             }
-#endif    // __AVX__
+#endif // __AVX__
             output0 += 8;
             output1 += 8;
             output2 += 8;
@@ -790,10 +789,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m128 _va2 = _mm_loadu_ps(va + 8);
                 __m128 _va3 = _mm_loadu_ps(va + 12);
 
-                _sum0 = _mm_fmadd_ps(_va0, _vb0, _sum0);    // sum0 += (k00-k30) * a00
-                _sum1 = _mm_fmadd_ps(_va1, _vb1, _sum1);    // sum1 += (k01-k31) * a10
-                _sum2 = _mm_fmadd_ps(_va2, _vb2, _sum2);    // sum2 += (k02-k32) * a20
-                _sum3 = _mm_fmadd_ps(_va3, _vb3, _sum3);    // sum3 += (k03-k33) * a30
+                _sum0 = _mm_fmadd_ps(_va0, _vb0, _sum0); // sum0 += (k00-k30) * a00
+                _sum1 = _mm_fmadd_ps(_va1, _vb1, _sum1); // sum1 += (k01-k31) * a10
+                _sum2 = _mm_fmadd_ps(_va2, _vb2, _sum2); // sum2 += (k02-k32) * a20
+                _sum3 = _mm_fmadd_ps(_va3, _vb3, _sum3); // sum3 += (k03-k33) * a30
 
                 va += 16;
                 vb += 4;
@@ -809,7 +808,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m128 _vb0 = _mm_set1_ps(vb[0]);
                 __m128 _va = _mm_loadu_ps(va);
 
-                _sum0_3 = _mm_fmadd_ps(_va, _vb0, _sum0_3);    // sum0 += (k00-k30) * a00
+                _sum0_3 = _mm_fmadd_ps(_va, _vb0, _sum0_3); // sum0 += (k00-k30) * a00
 
                 va += 4;
                 vb += 1;
@@ -841,7 +840,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
             output1[0] = sum1;
             output2[0] = sum2;
             output3[0] = sum3;
-#endif    // __AVX__
+#endif // __AVX__
             output0++;
             output1++;
             output2++;
@@ -877,10 +876,10 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _vb2 = _mm256_loadu_ps(vb + 16);
                 __m256 _vb3 = _mm256_loadu_ps(vb + 24);
 
-                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0);    // sum0 = (a00-a07) * k00
-                _sum0 = _mm256_fmadd_ps(_vb1, _va1, _sum0);    // sum0 += (a10-a17) * k01
-                _sum0 = _mm256_fmadd_ps(_vb2, _va2, _sum0);    // sum0 += (a20-a27) * k02
-                _sum0 = _mm256_fmadd_ps(_vb3, _va3, _sum0);    // sum0 += (a30-a37) * k03
+                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00
+                _sum0 = _mm256_fmadd_ps(_vb1, _va1, _sum0); // sum0 += (a10-a17) * k01
+                _sum0 = _mm256_fmadd_ps(_vb2, _va2, _sum0); // sum0 += (a20-a27) * k02
+                _sum0 = _mm256_fmadd_ps(_vb3, _va3, _sum0); // sum0 += (a30-a37) * k03
 
                 va += 4;
                 vb += 32;
@@ -892,7 +891,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
                 __m256 _va0 = _mm256_broadcast_ss(va);
                 __m256 _vb0 = _mm256_loadu_ps(vb);
 
-                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0);    // sum0 = (a00-a07) * k00
+                _sum0 = _mm256_fmadd_ps(_vb0, _va0, _sum0); // sum0 = (a00-a07) * k00
 
                 va += 1;
                 vb += 8;
@@ -917,7 +916,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
             {
                 output[n] = sum[n];
             }
-#endif    // __AVX__
+#endif // __AVX__
             output += 8;
         }
 
@@ -946,7 +945,7 @@ static void sgemm_fp(int M, int N, int K, float* pA_t, float* pB_t, float* pC, i
 #endif
 #else
             float sum0 = 0.f;
-#endif    // __AVX__
+#endif // __AVX__
             for (; k < K; k++)
             {
                 sum0 += va[0] * vb[0];
@@ -1019,7 +1018,7 @@ static void sgemm_i8(int M, int N, int K, int8_t* pA_t, int8_t* pB_t, int32_t* p
     {
         int i = pp * 8;
 
-        int32_t* output0 = pC + ( i )*N;
+        int32_t* output0 = pC + (i)*N;
         int32_t* output1 = pC + (i + 1) * N;
         int32_t* output2 = pC + (i + 2) * N;
         int32_t* output3 = pC + (i + 3) * N;
@@ -1327,7 +1326,7 @@ static void sgemm_i8(int M, int N, int K, int8_t* pA_t, int8_t* pB_t, int32_t* p
     {
         int i = remain_outch_start + pp * 4;
 
-        int32_t* output0 = pC + ( i )*N;
+        int32_t* output0 = pC + (i)*N;
         int32_t* output1 = pC + (i + 1) * N;
         int32_t* output2 = pC + (i + 2) * N;
         int32_t* output3 = pC + (i + 3) * N;
@@ -1641,13 +1640,13 @@ static void sgemm_fp32(struct tensor* input, struct tensor* filter, struct tenso
     int out_w = output->dims[3];
     int out_image_size = output->dims[1] * output->dims[2] * output->dims[3];
 
-    float* interleave_fp32 = ( float* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
+    float* interleave_fp32 = (float*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
     float* im2col_pack4_fp32 = (float*)priv_info->im2col_buffer_pack4;
-    float* output_fp32 = ( float* )output->data + n * out_image_size + outchan_g * group * out_h * out_w;
+    float* output_fp32 = (float*)output->data + n * out_image_size + outchan_g * group * out_h * out_w;
     float* bias_fp32 = NULL;
 
     if (bias)
-        bias_fp32 = ( float* )bias->data + outchan_g * group;
+        bias_fp32 = (float*)bias->data + outchan_g * group;
 
     float* filter_sgemm = interleave_fp32;
     float* input_sgemm_pack4 = im2col_pack4_fp32;
@@ -1712,15 +1711,15 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens
     int out_w = output->dims[3];
     int out_image_size = output->dims[1] * output->dims[2] * output->dims[3];
 
-    float* interleave_fp32 = ( float* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
+    float* interleave_fp32 = (float*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
     float* im2col_pack4_fp32 = (float*)priv_info->im2col_buffer_pack4;
-    uint8_t * output_uint8 = ( uint8_t* )output->data + n * out_image_size + outchan_g * group * out_h * out_w;
+    uint8_t* output_uint8 = (uint8_t*)output->data + n * out_image_size + outchan_g * group * out_h * out_w;
     int* bias_int32 = NULL;
     float bias_scale = 0.f;
 
     if (bias)
     {
-        bias_int32 = ( int* )bias->data + outchan_g * group;
+        bias_int32 = (int*)bias->data + outchan_g * group;
         bias_scale = input->scale * filter->scale;
     }
 
@@ -1738,7 +1737,7 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens
             for (int j = 0; j < out_h * out_w; j++)
             {
                 int output_off = i * (out_h * out_w) + j;
-                output_sgemm[output_off] += (float )bias_int32[i] * bias_scale;
+                output_sgemm[output_off] += (float)bias_int32[i] * bias_scale;
             }
         }
     }
@@ -1782,7 +1781,7 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens
         {
             int output_off = i * (out_h * out_w) + j;
 
-            int udata = ( int )(round(output_sgemm[output_off] / output->scale) + output->zero_point);
+            int udata = (int)(round(output_sgemm[output_off] / output->scale) + output->zero_point);
             if (udata > 255)
                 udata = 255;
             else if (udata < 0)
@@ -1795,8 +1794,8 @@ static void sgemm_uint8(struct tensor* input, struct tensor* filter, struct tens
 }
 
 static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tensor* bias,
-                        struct tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n,
-                        int group, int num_thread)
+                       struct tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n,
+                       int group, int num_thread)
 {
     int kernel_size = param->kernel_h * param->kernel_w * param->input_channel / param->group;
     int outchan_g = param->output_channel / param->group;
@@ -1805,13 +1804,13 @@ static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tenso
     int out_w = output->dims[3];
     int out_image_size = output->dims[1] * output->dims[2] * output->dims[3];
 
-    int8_t* interleave_int8 = ( int8_t* )priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
+    int8_t* interleave_int8 = (int8_t*)priv_info->interleave_buffer_pack4 + outchan_g * group * kernel_size;
     int8_t* im2col_pack4_int8 = (int8_t*)priv_info->im2col_buffer_pack4;
-    int8_t * output_int8 = ( int8_t* )output->data + n * out_image_size + outchan_g * group * out_h * out_w;
-    int32_t * bias_int32 = NULL;
+    int8_t* output_int8 = (int8_t*)output->data + n * out_image_size + outchan_g * group * out_h * out_w;
+    int32_t* bias_int32 = NULL;
 
     if (bias)
-        bias_int32 = ( int* )bias->data + outchan_g * group;
+        bias_int32 = (int*)bias->data + outchan_g * group;
 
     float input_scale = input->scale;
     float* kernel_scales = filter->scale_list;
@@ -1832,9 +1831,9 @@ static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tenso
         {
             int output_off = i * (out_h * out_w) + j;
             if (bias)
-                output_sgemm_fp32[output_off] = (float )(output_sgemm_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
+                output_sgemm_fp32[output_off] = (float)(output_sgemm_int32[output_off] + bias_int32[i]) * input_scale * kernel_scales[i];
             else
-                output_sgemm_fp32[output_off] = (float )output_sgemm_int32[output_off] * input_scale * kernel_scales[i];
+                output_sgemm_fp32[output_off] = (float)output_sgemm_int32[output_off] * input_scale * kernel_scales[i];
         }
     }
 
@@ -1880,7 +1879,7 @@ static void sgemm_int8(struct tensor* input, struct tensor* filter, struct tenso
         {
             int output_off = i * (out_h * out_w) + j;
 
-            int32_t data_i32 = ( int32_t )(round(output_sgemm_fp32[output_off] / output_scale));
+            int32_t data_i32 = (int32_t)(round(output_sgemm_fp32[output_off] / output_scale));
             if (data_i32 > 127)
                 data_i32 = 127;
             else if (data_i32 < -127)
@@ -1909,8 +1908,7 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w)
     if (in_h <= 10 && in_w <= 10)
         return 0;
 
-    if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 ||
-        dilation_w != 1 || input_chan < 16 || output_chan < 16 || output_chan % 16)
+    if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 || dilation_w != 1 || input_chan < 16 || output_chan < 16 || output_chan % 16)
         return 0;
 
     return 1;
@@ -1958,8 +1956,8 @@ int conv_hcl_get_interleave_pack4_size(int M, int K, struct tensor* filter)
 
 void conv_hcl_interleave_pack4_fp32(int M, int K, struct conv_priv_info* priv_info)
 {
-    float* pA = ( float* )priv_info->interleave_buffer;
-    float* pA_t = ( float* )priv_info->interleave_buffer_pack4;
+    float* pA = (float*)priv_info->interleave_buffer;
+    float* pA_t = (float*)priv_info->interleave_buffer_pack4;
 
     int nn_outch = M >> 3;
     int remain_outch_start = nn_outch << 3;
@@ -2048,8 +2046,8 @@ void conv_hcl_interleave_pack4_fp32(int M, int K, struct conv_priv_info* priv_in
 
 void conv_hcl_interleave_pack4_int8(int M, int K, struct conv_priv_info* priv_info)
 {
-    int8_t* pA = ( int8_t * )priv_info->interleave_buffer;
-    int8_t* pA_t = ( int8_t* )priv_info->interleave_buffer_pack4;
+    int8_t* pA = (int8_t*)priv_info->interleave_buffer;
+    int8_t* pA_t = (int8_t*)priv_info->interleave_buffer_pack4;
 
     int nn_outch = M >> 3;
     int remain_outch_start = nn_outch << 3;
@@ -2217,8 +2215,7 @@ int conv_hcl_postrun(struct conv_priv_info* priv_info)
         return wino_conv_hcl_postrun(priv_info);
     }
 
-    if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem &&
-        priv_info->interleave_buffer != NULL)
+    if (priv_info->external_interleave_pack4_mem && !priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL)
     {
         sys_free(priv_info->interleave_buffer_pack4);
         priv_info->interleave_buffer_pack4 = NULL;
@@ -2256,7 +2253,7 @@ int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, stru
                                  cpu_affinity);
     }
 
-    for (int i = 0; i < input_tensor->dims[0]; i++)    // batch size
+    for (int i = 0; i < input_tensor->dims[0]; i++) // batch size
     {
         for (int j = 0; j < group; j++)
         {
diff --git a/source/device/cpu/op/conv/x86/conv_kernel_x86.h b/source/device/cpu/op/conv/x86/conv_kernel_x86.h
index 03237b896..8be2524c9 100644
--- a/source/device/cpu/op/conv/x86/conv_kernel_x86.h
+++ b/source/device/cpu/op/conv/x86/conv_kernel_x86.h
@@ -31,7 +31,6 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 /* float32 */
 int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
                     struct conv_priv_info* info, struct conv_param* param);
diff --git a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c
index e6355de2c..01c1169a6 100644
--- a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c
+++ b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.c
@@ -39,7 +39,7 @@
 #include <string.h>
 #include <math.h>
 
-#define TILE 4
+#define TILE      4
 #define ELEM_SIZE ((TILE + 2) * (TILE + 2))
 
 #define WINO_MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -49,11 +49,11 @@ static void relu(float* data, int size, int activation)
 {
     for (int i = 0; i < size; i++)
     {
-        data[i] = WINO_MAX(data[i], ( float )0);
+        data[i] = WINO_MAX(data[i], (float)0);
 
         if (activation > 0)
         {
-            data[i] = WINO_MIN(data[i], ( float )activation);
+            data[i] = WINO_MIN(data[i], (float)activation);
         }
     }
 }
@@ -62,7 +62,7 @@ static int get_private_mem_size(struct tensor* filter, struct conv_param* param)
     int output_c = filter->dims[0];
     int input_c = filter->dims[1];
     int trans_ker_size = (unsigned long)output_c * input_c * ELEM_SIZE * sizeof(float);
-    return trans_ker_size + 128;    // caution
+    return trans_ker_size + 128; // caution
 }
 
 static void pad_0_align_2D(float* dst, float* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w)
@@ -144,7 +144,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
         int w_tm = outw_align / 4 * 6;
         int h_tm = outh_align / 4 * 6;
 
-        int nColBlocks = h_tm / 6;    // may be the block num in Feathercnn
+        int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
         int nRowBlocks = w_tm / 6;
 
         const int tiles = nColBlocks * nRowBlocks;
@@ -527,7 +527,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                         out_tm8[2] = d5[4];
                         out_tm8[3] = d5[5];
                     }
-#endif    // __AVX__
+#endif // __AVX__
                     r0 += 4;
                     r1 += 4;
                     r2 += 4;
@@ -545,7 +545,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
         int w_tm = outw_align / 4 * 6;
         int h_tm = outh_align / 4 * 6;
 
-        int nColBlocks = h_tm / 6;    // may be the block num in Feathercnn
+        int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
         int nRowBlocks = w_tm / 6;
 
         const int tiles = nColBlocks * nRowBlocks;
@@ -815,7 +815,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                         output6_tm[n] = sum6[n];
                         output7_tm[n] = sum7[n];
                     }
-#endif    // __AVX__
+#endif // __AVX__
                     output0_tm += 36;
                     output1_tm += 36;
                     output2_tm += 36;
@@ -911,7 +911,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                         output2_tm[n] = sum2[n];
                         output3_tm[n] = sum3[n];
                     }
-#endif    // __AVX__
+#endif // __AVX__
                     output0_tm += 36;
                     output1_tm += 36;
                     output2_tm += 36;
@@ -929,8 +929,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
 
                 for (int i = 0; i < tiles; i++)
                 {
-                    const float* kptr =
-                        kernel_tm_test + 4 * r * inch * outch + (p / 8 + (p % 8) / 4 + p % 4) * inch * 4;
+                    const float* kptr = kernel_tm_test + 4 * r * inch * outch + (p / 8 + (p % 8) / 4 + p % 4) * inch * 4;
                     const float* r0 = bottom_blob_tm + 4 * inch * (tiles * r + i);
 #if __AVX__ || __SSE__
 #if __AVX__
@@ -970,7 +969,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
                     {
                         output0_tm[n] = sum0[n];
                     }
-#endif    // __AVX__ || __SSE__
+#endif // __AVX__ || __SSE__
                     output0_tm += 36;
                 }
             }
@@ -1005,7 +1004,7 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
         int w_tm = outw_align / 4 * 6;
         int h_tm = outh_align / 4 * 6;
 
-        int nColBlocks = h_tm / 6;    // may be the block num in Feathercnn
+        int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
         int nRowBlocks = w_tm / 6;
 
         const int tiles = nColBlocks * nRowBlocks;
@@ -1118,12 +1117,11 @@ void conv3x3s1_winograd43_sse(float* bottom_blob, float* top_blob, float* kernel
 
 void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kernel_wino, int inch, int outch)
 {
-    float* kernel_tm = ( float* )sys_malloc((unsigned long)6 * 6 * inch * outch * sizeof(float));
+    float* kernel_tm = (float*)sys_malloc((unsigned long)6 * 6 * inch * outch * sizeof(float));
 
     // G
     const float ktm[6][3] = {
-        {1.0f / 4, 0.0f, 0.0f},           {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6},
-        {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}};
+        {1.0f / 4, 0.0f, 0.0f}, {-1.0f / 6, -1.0f / 6, -1.0f / 6}, {-1.0f / 6, 1.0f / 6, -1.0f / 6}, {1.0f / 24, 1.0f / 12, 1.0f / 6}, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}};
 
 #pragma omp parallel for
     for (int p = 0; p < outch; p++)
@@ -1166,14 +1164,14 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne
         int p = 0;
         for (; p + 7 < outch; p += 8)
         {
-            const float* kernel0 = ( const float* )kernel_tm + p * inch * 36;
-            const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36;
-            const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36;
-            const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36;
-            const float* kernel4 = ( const float* )kernel_tm + (p + 4) * inch * 36;
-            const float* kernel5 = ( const float* )kernel_tm + (p + 5) * inch * 36;
-            const float* kernel6 = ( const float* )kernel_tm + (p + 6) * inch * 36;
-            const float* kernel7 = ( const float* )kernel_tm + (p + 7) * inch * 36;
+            const float* kernel0 = (const float*)kernel_tm + p * inch * 36;
+            const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36;
+            const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36;
+            const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36;
+            const float* kernel4 = (const float*)kernel_tm + (p + 4) * inch * 36;
+            const float* kernel5 = (const float*)kernel_tm + (p + 5) * inch * 36;
+            const float* kernel6 = (const float*)kernel_tm + (p + 6) * inch * 36;
+            const float* kernel7 = (const float*)kernel_tm + (p + 7) * inch * 36;
 
             float* ktmp = kernel_tm_test + p / 8 * inch * 32;
 
@@ -1233,10 +1231,10 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne
 
         for (; p + 3 < outch; p += 4)
         {
-            const float* kernel0 = ( const float* )kernel_tm + p * inch * 36;
-            const float* kernel1 = ( const float* )kernel_tm + (p + 1) * inch * 36;
-            const float* kernel2 = ( const float* )kernel_tm + (p + 2) * inch * 36;
-            const float* kernel3 = ( const float* )kernel_tm + (p + 3) * inch * 36;
+            const float* kernel0 = (const float*)kernel_tm + p * inch * 36;
+            const float* kernel1 = (const float*)kernel_tm + (p + 1) * inch * 36;
+            const float* kernel2 = (const float*)kernel_tm + (p + 2) * inch * 36;
+            const float* kernel3 = (const float*)kernel_tm + (p + 3) * inch * 36;
 
             float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4) * inch * 16;
             for (int q = 0; q < inch; q++)
@@ -1271,7 +1269,7 @@ void conv3x3s1_winograd43_transform_kernel_sse(const float* kernel, float* kerne
 
         for (; p < outch; p++)
         {
-            const float* kernel0 = ( const float* )kernel_tm + p * inch * 36;
+            const float* kernel0 = (const float*)kernel_tm + p * inch * 36;
             float* ktmp = kernel_tm_test + (p / 8 + (p % 8) / 4 + p % 4) * inch * 4;
 
             for (int q = 0; q < inch; q++)
@@ -1305,7 +1303,7 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
     int pad_h = param->pad_h0;
     int pad_w = param->pad_w0;
 
-    float* kernel = ( float* )filter_tensor->data;
+    float* kernel = (float*)filter_tensor->data;
 
     if (!priv_info->external_interleave_mem)
     {
@@ -1325,17 +1323,17 @@ int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tens
 
     int outw = block_w * TILE;
     int outh = block_h * TILE;
-    priv_info->input_pad = ( float* )sys_malloc((unsigned long)batch * input_c * pad_inhw * sizeof(float));
+    priv_info->input_pad = (float*)sys_malloc((unsigned long)batch * input_c * pad_inhw * sizeof(float));
     memset(priv_info->input_pad, 0, (unsigned long)batch * input_c * pad_inhw * sizeof(float));
-    priv_info->dot_block = ( float* )sys_malloc(ELEM_SIZE * (unsigned long)block * output_c * sizeof(float));
-    priv_info->transform_input = ( float* )sys_malloc(ELEM_SIZE * (unsigned long)block * input_c * sizeof(float));
+    priv_info->dot_block = (float*)sys_malloc(ELEM_SIZE * (unsigned long)block * output_c * sizeof(float));
+    priv_info->transform_input = (float*)sys_malloc(ELEM_SIZE * (unsigned long)block * input_c * sizeof(float));
     priv_info->output_bordered = NULL;
     if (outw != output_w || outh != output_h)
     {
-        priv_info->output_bordered = ( float* )sys_malloc((unsigned long)outw * outh * output_c * sizeof(float));
+        priv_info->output_bordered = (float*)sys_malloc((unsigned long)outw * outh * output_c * sizeof(float));
     }
 
-    conv3x3s1_winograd43_transform_kernel_sse(kernel, ( float* )priv_info->interleave_buffer, input_c, output_c);
+    conv3x3s1_winograd43_transform_kernel_sse(kernel, (float*)priv_info->interleave_buffer, input_c, output_c);
 
     return 0;
 }
@@ -1416,11 +1414,11 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
     int padded_in_hw = padded_in_h * padded_in_w;
 
     /* buffer addr */
-    float* input = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
+    float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
     float* biases = NULL;
     if (bias_tensor != NULL)
-        biases = ( float* )bias_tensor->data;
+        biases = (float*)bias_tensor->data;
 
     for (int i = 0; i < batch; i++)
     {
@@ -1429,9 +1427,9 @@ int wino_conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor,
             pad_0_align_3D((float*)priv_info->input_pad + i * in_c * padded_in_h * padded_in_w, input + i * in_c * in_h * in_w,
                            in_h, in_w, padded_in_h, padded_in_w, in_c, pad_h0, pad_w0);
             conv3x3s1_winograd43_sse((float*)priv_info->input_pad + i * in_c * padded_in_h * padded_in_w + g * input_size_g,
-                output + i * out_c * out_h * out_w, (float*)priv_info->interleave_buffer, (float*)priv_info->dot_block, 
-                (float*)priv_info->transform_input, (float*)priv_info->output_bordered,
-                biases, padded_in_w, padded_in_h, in_c, out_w, out_h, out_c, num_thread);
+                                     output + i * out_c * out_h * out_w, (float*)priv_info->interleave_buffer, (float*)priv_info->dot_block,
+                                     (float*)priv_info->transform_input, (float*)priv_info->output_bordered,
+                                     biases, padded_in_w, padded_in_h, in_c, out_w, out_h, out_c, num_thread);
         }
     }
     if (act_type >= 0)
diff --git a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h
index 3cae478fb..2f3201f44 100644
--- a/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h
+++ b/source/device/cpu/op/conv/x86/wino_conv_kernel_x86.h
@@ -38,7 +38,6 @@
 #include <immintrin.h>
 #endif
 
-
 int wino_conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor,
                          struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param);
 
diff --git a/source/device/cpu/op/crop/crop_ref.c b/source/device/cpu/op/crop/crop_ref.c
index 2d89b45a6..f59650a39 100644
--- a/source/device/cpu/op/crop/crop_ref.c
+++ b/source/device/cpu/op/crop/crop_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct crop_param* param,
                          int num_thread)
 {
@@ -71,8 +70,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens
                             for (int w = 0; w < oDataW; w++)
                             {
                                 int i_w = w + offsetW;
-                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                    input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
+                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
                             }
                         }
                     }
@@ -93,8 +91,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens
                             for (int w = 0; w < oDataW; w++)
                             {
                                 int i_w = w + param->offset_w;
-                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                    input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
+                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
                             }
                         }
                     }
@@ -118,8 +115,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens
                         for (int w = 0; w < oDataW; w++)
                         {
                             int i_w = param->offset_w + w;
-                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w];
+                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w];
                         }
                     }
                 }
@@ -137,8 +133,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens
                         for (int w = 0; w < oDataW; w++)
                         {
                             int i_w = param->offset_w + w;
-                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
+                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
                         }
                     }
                 }
@@ -150,7 +145,7 @@ static int ref_crop_fp32(struct tensor* input_tensor, struct tensor* output_tens
 }
 
 static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct crop_param* param,
-                         int num_thread)
+                          int num_thread)
 {
     uint8_t* input = (uint8_t*)input_tensor->data;
     uint8_t* output = (uint8_t*)output_tensor->data;
@@ -183,8 +178,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten
                             for (int w = 0; w < oDataW; w++)
                             {
                                 int i_w = w + offsetW;
-                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                    input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
+                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
                             }
                         }
                     }
@@ -205,8 +199,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten
                             for (int w = 0; w < oDataW; w++)
                             {
                                 int i_w = w + param->offset_w;
-                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                    input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
+                                output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
                             }
                         }
                     }
@@ -230,8 +223,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten
                         for (int w = 0; w < oDataW; w++)
                         {
                             int i_w = param->offset_w + w;
-                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w];
+                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + i_c * iDataH * iDataW + i_h * iDataW + i_w];
                         }
                     }
                 }
@@ -249,8 +241,7 @@ static int ref_crop_uint8(struct tensor* input_tensor, struct tensor* output_ten
                         for (int w = 0; w < oDataW; w++)
                         {
                             int i_w = param->offset_w + w;
-                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] =
-                                input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
+                            output[n * oDataC * oDataH * oDataW + c * oDataH * oDataW + h * oDataW + w] = input[n * iDataC * iDataH * iDataW + c * iDataH * iDataW + i_h * iDataW + i_w];
                         }
                     }
                 }
@@ -278,11 +269,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct crop_param* crop_param = ( struct crop_param* )ir_node->op.param_mem;
+    struct crop_param* crop_param = (struct crop_param*)ir_node->op.param_mem;
 
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ref_crop_fp32(input_tensor, output_tensor, crop_param, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ref_crop_uint8(input_tensor, output_tensor, crop_param, exec_graph->num_thread);
 
     return 0;
diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c
index 1daba216a..360f061ea 100644
--- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c
+++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -54,10 +53,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem;
+    struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem;
 
-    if (deconv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, deconv_param, num_thread, cpu_affinity) <
-        0)
+    if (deconv_dw_run(input_tensor, weight_tensor, bias_tensor, output_tensor, deconv_param, num_thread, cpu_affinity) < 0)
     {
         TLOG_ERR("hcl conv run failed\n");
         // set_tengine_errno(EFAULT);
@@ -79,7 +77,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
-    struct deconv_param* param = ( struct deconv_param* )exec_node->op.param_mem;
+    struct deconv_param* param = (struct deconv_param*)exec_node->op.param_mem;
     struct node* ir_node = exec_node;
     struct graph* ir_graph = ir_node->graph;
 
@@ -117,8 +115,7 @@ static struct node_ops hcl_node_ops = {.prerun = NULL,
                                        .postrun = NULL,
                                        .init_node = init_node,
                                        .release_node = release_node,
-                                       .score = score
-};
+                                       .score = score};
 
 int register_deconv_dw_hcl_arm_op()
 {
diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c
index e5e50f111..18d07bc5b 100644
--- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c
+++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.c
@@ -37,7 +37,6 @@
 #else
 #endif
 
-
 inline static float do_activation(float input, int activation)
 {
     if (activation == 0)
@@ -185,45 +184,45 @@ inline static void deconv_dw_genreal_3x3s2(const float* input, const float* kern
                 float32x4_t input_4 = vld1q_f32(cur_input);
 
                 // out row 0
-                float32x4_t out_00 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 0);    // 0,2,4,6
-                float32x4_t out_01 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 1);    // 1,3,5,7
-                float32x4_t out_02 = vmulq_lane_f32(input_4, vget_high_f32(_k0), 0);    // 2,4,6,8
+                float32x4_t out_00 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 0);  // 0,2,4,6
+                float32x4_t out_01 = vmulq_lane_f32(input_4, vget_low_f32(_k0), 1);  // 1,3,5,7
+                float32x4_t out_02 = vmulq_lane_f32(input_4, vget_high_f32(_k0), 0); // 2,4,6,8
 
                 float32x4x2_t out_0 = vld2q_f32(cur_out0);
-                out_0.val[0] = vaddq_f32(out_0.val[0], out_00);    // 0,2,4,6
-                out_0.val[1] = vaddq_f32(out_0.val[1], out_01);    // 1,3,5,7
+                out_0.val[0] = vaddq_f32(out_0.val[0], out_00); // 0,2,4,6
+                out_0.val[1] = vaddq_f32(out_0.val[1], out_01); // 1,3,5,7
                 vst2q_f32(cur_out0, out_0);
 
                 out_0 = vld2q_f32(cur_out0 + 2);
-                out_0.val[0] = vaddq_f32(out_0.val[0], out_02);    // 2,4,6,8
+                out_0.val[0] = vaddq_f32(out_0.val[0], out_02); // 2,4,6,8
                 vst2q_f32(cur_out0 + 2, out_0);
 
                 // out row 1
-                float32x4_t out_10 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 0);    // 0,2,4,6
-                float32x4_t out_11 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 1);    // 1,3,5,7
-                float32x4_t out_12 = vmulq_lane_f32(input_4, vget_high_f32(_k1), 0);    // 2,4,6,8
+                float32x4_t out_10 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 0);  // 0,2,4,6
+                float32x4_t out_11 = vmulq_lane_f32(input_4, vget_low_f32(_k1), 1);  // 1,3,5,7
+                float32x4_t out_12 = vmulq_lane_f32(input_4, vget_high_f32(_k1), 0); // 2,4,6,8
 
                 float32x4x2_t out_1 = vld2q_f32(cur_out1);
-                out_1.val[0] = vaddq_f32(out_1.val[0], out_10);    // 0,2,4,6
-                out_1.val[1] = vaddq_f32(out_1.val[1], out_11);    // 1,3,5,7
+                out_1.val[0] = vaddq_f32(out_1.val[0], out_10); // 0,2,4,6
+                out_1.val[1] = vaddq_f32(out_1.val[1], out_11); // 1,3,5,7
                 vst2q_f32(cur_out1, out_1);
 
                 out_1 = vld2q_f32(cur_out1 + 2);
-                out_1.val[0] = vaddq_f32(out_1.val[0], out_12);    // 2,4,6,8
+                out_1.val[0] = vaddq_f32(out_1.val[0], out_12); // 2,4,6,8
                 vst2q_f32(cur_out1 + 2, out_1);
 
                 // out row 2
-                float32x4_t out_20 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 0);    // 0,2,4,6
-                float32x4_t out_21 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 1);    // 1,3,5,7
-                float32x4_t out_22 = vmulq_lane_f32(input_4, vget_high_f32(_k2), 0);    // 2,4,6,8
+                float32x4_t out_20 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 0);  // 0,2,4,6
+                float32x4_t out_21 = vmulq_lane_f32(input_4, vget_low_f32(_k2), 1);  // 1,3,5,7
+                float32x4_t out_22 = vmulq_lane_f32(input_4, vget_high_f32(_k2), 0); // 2,4,6,8
 
                 float32x4x2_t out_2 = vld2q_f32(cur_out2);
-                out_2.val[0] = vaddq_f32(out_2.val[0], out_20);    // 0,2,4,6
-                out_2.val[1] = vaddq_f32(out_2.val[1], out_21);    // 1,3,5,7
+                out_2.val[0] = vaddq_f32(out_2.val[0], out_20); // 0,2,4,6
+                out_2.val[1] = vaddq_f32(out_2.val[1], out_21); // 1,3,5,7
                 vst2q_f32(cur_out2, out_2);
 
                 out_2 = vld2q_f32(cur_out2 + 2);
-                out_2.val[0] = vaddq_f32(out_2.val[0], out_22);    // 2,4,6,8
+                out_2.val[0] = vaddq_f32(out_2.val[0], out_22); // 2,4,6,8
                 vst2q_f32(cur_out2 + 2, out_2);
 
                 cur_input += 4;
@@ -472,12 +471,12 @@ int deconv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, str
     int output_size = out_c * out_h * out_w;
     int out_c_align = ((out_c + 3) & -4);
     /* buffer addr */
-    float* input_buf = ( float* )input_tensor->data;
-    float* kernel_buf = ( float* )filter_tensor->data;
-    float* output_buf = ( float* )output_tensor->data;
-    float* biases_buf = ( float* )bias_tensor->data;
+    float* input_buf = (float*)input_tensor->data;
+    float* kernel_buf = (float*)filter_tensor->data;
+    float* output_buf = (float*)output_tensor->data;
+    float* biases_buf = (float*)bias_tensor->data;
 
-    for (int n = 0; n < batch; n++)    // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
         float* cur_input = input_buf + n * input_size * group;
         float* cur_output = output_buf + n * output_size * group;
@@ -510,7 +509,7 @@ int deconv_dw_run(struct tensor* input_tensor, struct tensor* filter_tensor, str
         {
             int out_h_pad = out_h + pads[0] * 2;
             int out_w_pad = out_w + pads[1] * 2;
-            float* output_buf = ( float* )malloc(sizeof(float) * group * out_h_pad * out_w_pad + 128);
+            float* output_buf = (float*)malloc(sizeof(float) * group * out_h_pad * out_w_pad + 128);
 
             if (stride_h == 1 && kernel_h == 4)
             {
diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h
index 091f7a1d8..93576a691 100644
--- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h
+++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_kernel_arm.h
@@ -28,13 +28,12 @@
 
 #include "graph/tensor.h"
 
-
-int deconv_dw_run(struct tensor* input_tensor , \
-                 struct tensor* filter_tensor ,\
-                 struct tensor* bias_tensor ,  \
-                 struct tensor* output_tensor , \
-                 struct deconv_param* param, \
-                 int num_thread, \
-                 int cpu_affinity) ;
+int deconv_dw_run(struct tensor* input_tensor,
+                  struct tensor* filter_tensor,
+                  struct tensor* bias_tensor,
+                  struct tensor* output_tensor,
+                  struct deconv_param* param,
+                  int num_thread,
+                  int cpu_affinity);
 
 #endif
diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c
index fa5883320..a81fa1e8c 100644
--- a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c
+++ b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -45,13 +44,13 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* filter_tensor;
     struct tensor* output_tensor;
 
-    struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv;
+    struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv;
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem;
+    struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem;
 
     /* prerun now */
     if (deconv_hcl_prerun(input_tensor, filter_tensor, output_tensor, deconv_priv_info, deconv_param) < 0)
@@ -81,11 +80,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem;
-    struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv;
+    struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem;
+    struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv;
 
     if (deconv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, deconv_priv_info, deconv_param,
-                       num_thread, cpu_affinity) < 0)
+                       num_thread, cpu_affinity)
+        < 0)
     {
         TLOG_ERR("hcl deconv run failed\n");
         // set_tengine_errno(EFAULT);
@@ -102,7 +102,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv;
+    struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv;
 
     if (deconv_hcl_postrun(deconv_priv_info) < 0)
     {
@@ -123,8 +123,8 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem;
-    struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )sys_malloc(sizeof(struct deconv_priv_info));
+    struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem;
+    struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)sys_malloc(sizeof(struct deconv_priv_info));
 
     if (deconv_priv_info == NULL)
     {
@@ -140,7 +140,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
 
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct deconv_priv_info* deconv_priv_info = ( struct deconv_priv_info* )exec_node->ops_priv;
+    struct deconv_priv_info* deconv_priv_info = (struct deconv_priv_info*)exec_node->ops_priv;
     sys_free(deconv_priv_info);
     exec_node->ops_priv = NULL;
     return 0;
@@ -157,8 +157,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .postrun = postrun,
                                        .init_node = init_node,
                                        .release_node = release_node,
-                                       .score = score
-};
+                                       .score = score};
 
 int register_deconv_hcl_arm_op()
 {
diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c
index efb532a9f..e69ae1b46 100644
--- a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c
+++ b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.c
@@ -28,7 +28,6 @@
 #include <stdlib.h>
 #include <math.h>
 
-
 #ifdef __aarch64__
 #define PER_OUT_CHAN 16
 void sgemm_4x16_deconv_a72(float* input, float* kernel, long kernel_size, float* output, long weight_size);
@@ -57,37 +56,37 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern
     float* cur_kernel_interleaved = kernel_interleaved;
 
     // interleave PER_OUT_CHAN kernels
-    for(i = 0; i + PER_OUT_CHAN - 1 < kernel_size; i += PER_OUT_CHAN)
+    for (i = 0; i + PER_OUT_CHAN - 1 < kernel_size; i += PER_OUT_CHAN)
     {
-        for(j = 0; j < kernel_chan; j++)
+        for (j = 0; j < kernel_chan; j++)
         {
-            for(k = 0; k < PER_OUT_CHAN; k++)
+            for (k = 0; k < PER_OUT_CHAN; k++)
                 *(cur_kernel_interleaved++) = kernel[j * kernel_size + i + k];
         }
     }
-    for(; i < (kernel_size & -4); i += 4)
+    for (; i < (kernel_size & -4); i += 4)
     {
-        for(j = 0; j < kernel_chan; j++)
+        for (j = 0; j < kernel_chan; j++)
         {
-            for(k = 0; k < 4; k++)
+            for (k = 0; k < 4; k++)
                 *(cur_kernel_interleaved++) = kernel[j * kernel_size + i + k];
         }
     }
     // last 4 kernel
     int kernel_size3 = kernel_chan & 0x3;
-    if(kernel_size3)
+    if (kernel_size3)
     {
-        for(j = 0; j < kernel_chan; j++)
+        for (j = 0; j < kernel_chan; j++)
         {
-            for(k = 0; k < kernel_size3; k++)
+            for (k = 0; k < kernel_size3; k++)
                 *(cur_kernel_interleaved++) = kernel[j * kernel_size + i + k];
-            for(; k < 4; k++)
-            *(cur_kernel_interleaved++) = 0.0;
+            for (; k < 4; k++)
+                *(cur_kernel_interleaved++) = 0.0;
         }
     }
 }
 
-static void interleave(struct tensor * filter, struct deconv_priv_info*  priv_info, struct deconv_param* param)
+static void interleave(struct tensor* filter, struct deconv_priv_info* priv_info, struct deconv_param* param)
 {
     int group = param->group;
     int out_chan = filter->dims[0] / group;
@@ -98,7 +97,7 @@ static void interleave(struct tensor * filter, struct deconv_priv_info*  priv_in
 
     float* kernel = filter->data;
     float* interleave_buf = priv_info->interleave_buffer;
-    for(int g = 0; g < group; g++)
+    for (int g = 0; g < group; g++)
     {
         float* cur_kernel = kernel + g * kernel_size * in_chan;
         float* cur_interleave = interleave_buf + g * kernel_size_algin;
@@ -113,26 +112,26 @@ static void transpose_input(float* input, float* inputT, int input_w, int input_
 
     float* cur_input = inputT;
 
-    for(i = 0; i < (input_w & -4); i += 4)
-        for(j = 0; j < input_h; j++)
-            for(k = 0; k < 4; k++)
+    for (i = 0; i < (input_w & -4); i += 4)
+        for (j = 0; j < input_h; j++)
+            for (k = 0; k < 4; k++)
                 *cur_input++ = *(input + j * input_w + i + k);
 
-    if(input_w3)
+    if (input_w3)
     {
-        for(j = 0; j < input_h; j++)
+        for (j = 0; j < input_h; j++)
         {
-            for(k = 0; k < input_w3; k++)
+            for (k = 0; k < input_w3; k++)
                 *cur_input++ = *(input + j * input_w + i + k);
-            for(; k < 4; k++)
+            for (; k < 4; k++)
                 *cur_input++ = 0;
         }
     }
 }
 
 static void col2im(float* col, float* im, float* bias, int output_ch, int output_x, int output_y,
-            int kernel_x, int kernel_y, int stride_x, int stride_y, int dilation_x, int dilation_y, int pad_x,
-            int pad_y, int input_x, int input_y)
+                   int kernel_x, int kernel_y, int stride_x, int stride_y, int dilation_x, int dilation_y, int pad_x,
+                   int pad_y, int input_x, int input_y)
 {
     float* cur_col;
     int imx_start, imy_start, ix, iy, kch, kx, ky, imx, imy;
@@ -143,49 +142,49 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output
     int is_4x4 = (kernel_x == 4 && kernel_y == 4 && is_nodilation);
     int is_8x8 = (kernel_x == 8 && kernel_y == 8 && is_nodilation);
     /* init bias */
-    if(bias == NULL)
+    if (bias == NULL)
     {
-        for(int i = 0; i < (output_xy * output_ch); i++)
+        for (int i = 0; i < (output_xy * output_ch); i++)
             im[i] = 0;
     }
     else
     {
         float* cur_im = im;
-        for(int i = 0; i < output_ch; i++)
-            for(int j = 0; j < output_xy; j++)
+        for (int i = 0; i < output_ch; i++)
+            for (int j = 0; j < output_xy; j++)
                 *cur_im++ = bias[i];
     }
 
-    if(is_4x4)
+    if (is_4x4)
     {
-        for(iy = 0; iy < input_y; iy++)
+        for (iy = 0; iy < input_y; iy++)
         {
             imy_start = iy * stride_y - pad_y;
-            for(ix = 0; ix < input_x; ix++)
+            for (ix = 0; ix < input_x; ix++)
             {
                 imx_start = ix * stride_x - pad_x;
                 cur_col = col + (iy * input_x + ix) * weight_size;
-                if(iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1))
+                if (iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1))
                 {
-                    for(kch = 0; kch < output_ch; kch++)
-                        for(ky = 0; ky < 4; ky++)
+                    for (kch = 0; kch < output_ch; kch++)
+                        for (ky = 0; ky < 4; ky++)
                         {
                             imy = imy_start + ky;
-                            for(kx = 0; kx < 4; kx++)
-                                *(im + output_xy * kch + output_x * imy + imx_start + kx) += *cur_col++ ;
+                            for (kx = 0; kx < 4; kx++)
+                                *(im + output_xy * kch + output_x * imy + imx_start + kx) += *cur_col++;
                         }
                 }
                 else
                 {
-                    for(kch = 0; kch < output_ch; kch++)
+                    for (kch = 0; kch < output_ch; kch++)
                     {
-                        for(ky = 0; ky < 4; ky++)
+                        for (ky = 0; ky < 4; ky++)
                         {
                             imy = imy_start + ky;
-                            for(kx = 0; kx < 4; kx++)
+                            for (kx = 0; kx < 4; kx++)
                             {
                                 imx = imx_start + kx;
-                                if(imx >= 0 && imx < output_x && imy >= 0 && imy < output_y)
+                                if (imx >= 0 && imx < output_x && imy >= 0 && imy < output_y)
                                     *(im + output_xy * kch + output_x * imy + imx) += *cur_col;
                                 cur_col++;
                             }
@@ -195,35 +194,35 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output
             }
         }
     }
-    else if(is_8x8)
+    else if (is_8x8)
     {
-        for(iy = 0; iy < input_y; iy++)
+        for (iy = 0; iy < input_y; iy++)
         {
             imy_start = iy * stride_y - pad_y;
-            for(ix = 0; ix < input_x; ix++)
+            for (ix = 0; ix < input_x; ix++)
             {
                 imx_start = ix * stride_x - pad_x;
                 cur_col = col + (iy * input_x + ix) * weight_size;
-                if(iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1))
+                if (iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1))
                 {
-                    for(kch = 0; kch < output_ch; kch++)
-                        for(ky = 0; ky < 8; ky++)
+                    for (kch = 0; kch < output_ch; kch++)
+                        for (ky = 0; ky < 8; ky++)
                         {
                             imy = imy_start + ky;
-                            for(kx = 0; kx < 8; kx++)
+                            for (kx = 0; kx < 8; kx++)
                                 *(im + output_xy * kch + output_x * imy + imx_start + kx) += *cur_col++;
                         }
                 }
                 else
                 {
-                    for(kch = 0; kch < output_ch; kch++)
-                        for(ky = 0; ky < 8; ky++)
+                    for (kch = 0; kch < output_ch; kch++)
+                        for (ky = 0; ky < 8; ky++)
                         {
                             imy = imy_start + ky;
-                            for(kx = 0; kx < 8; kx++)
+                            for (kx = 0; kx < 8; kx++)
                             {
                                 imx = imx_start + kx;
-                                if(imx >= 0 && imx < output_x && imy >= 0 && imy < output_y)
+                                if (imx >= 0 && imx < output_x && imy >= 0 && imy < output_y)
                                     *(im + output_xy * kch + output_x * imy + imx) += *cur_col;
                                 cur_col++;
                             }
@@ -235,20 +234,20 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output
     // general case
     else
     {
-        for(iy = 0; iy < input_y; iy++)
+        for (iy = 0; iy < input_y; iy++)
         {
             imy_start = iy * stride_y - pad_y;
-            for(ix = 0; ix < input_x; ix++)
+            for (ix = 0; ix < input_x; ix++)
             {
                 imx_start = ix * stride_x - pad_x;
                 cur_col = col + (iy * input_x + ix) * weight_size;
-                if(iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1))
+                if (iy != 0 && iy != (input_y - 1) && ix != 0 && ix != (input_x - 1))
                 {
-                    for(kch = 0; kch < output_ch; kch++)
-                        for(ky = 0; ky < kernel_y; ky++)
+                    for (kch = 0; kch < output_ch; kch++)
+                        for (ky = 0; ky < kernel_y; ky++)
                         {
                             imy = imy_start + ky * dilation_y;
-                            for(kx = 0; kx < kernel_x; kx++)
+                            for (kx = 0; kx < kernel_x; kx++)
                             {
                                 imx = imx_start + kx * dilation_x;
                                 *(im + output_xy * kch + output_x * imy + imx) += *cur_col++;
@@ -257,16 +256,16 @@ static void col2im(float* col, float* im, float* bias, int output_ch, int output
                 }
                 else
                 {
-                    for(kch = 0; kch < output_ch; kch++)
+                    for (kch = 0; kch < output_ch; kch++)
                     {
-                        for(ky = 0; ky < kernel_y; ky++)
+                        for (ky = 0; ky < kernel_y; ky++)
                         {
                             imy = imy_start + ky * dilation_y;
-                            for(kx = 0; kx < kernel_x; kx++)
+                            for (kx = 0; kx < kernel_x; kx++)
                             {
                                 imx = imx_start + kx * dilation_x;
                                 float out = bias[kch];
-                                if(imx >= 0 && imx < output_x && imy >= 0 && imy < output_y)
+                                if (imx >= 0 && imx < output_x && imy >= 0 && imy < output_y)
                                     *(im + output_xy * kch + output_x * imy + imx) += *cur_col;
                                 cur_col++;
                             }
@@ -282,23 +281,23 @@ static void sgemm_set(float* input, float* kernel, float* col, int in_ch, int in
                       int kernel_start, int kernel_end, int num_thread, int cpu_affinity)
 {
     int nn_kernel = (kernel_end - kernel_start) / PER_OUT_CHAN;
-    int input_end3 = in_hw & 0x3;    
+    int input_end3 = in_hw & 0x3;
 
     if (input_end3)
     {
-        #pragma omp parallel for num_threads(num_thread)
-        for (int pp=0; pp<nn_kernel; pp++)
+#pragma omp parallel for num_threads(num_thread)
+        for (int pp = 0; pp < nn_kernel; pp++)
         {
             int p = kernel_start + pp * PER_OUT_CHAN;
 
-            float* cur_kernel = (float* )(kernel + p * in_ch);
+            float* cur_kernel = (float*)(kernel + p * in_ch);
 
             int i = 0;
-            for(i = 0; i + 3 < in_hw; i += 4)
+            for (i = 0; i + 3 < in_hw; i += 4)
 #ifdef __aarch64__
             {
-                float* cur_input = (float* )(input +  i * in_ch);
-                float* cur_col = ( float* )(col + i * kernel_size + p);
+                float* cur_input = (float*)(input + i * in_ch);
+                float* cur_col = (float*)(col + i * kernel_size + p);
                 if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                     sgemm_4x16_deconv_a53(cur_input, cur_kernel, in_ch, cur_col, kernel_size);
                 else
@@ -306,21 +305,21 @@ static void sgemm_set(float* input, float* kernel, float* col, int in_ch, int in
             }
             {
                 float result[64];
-                float* cur_input = (float* )(input +  i * in_ch);
+                float* cur_input = (float*)(input + i * in_ch);
                 if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                     sgemm_4x16_deconv_a53(cur_input, cur_kernel, in_ch, result, 16);
                 else
                     sgemm_4x16_deconv_a72(cur_input, cur_kernel, in_ch, result, 16);
-                for(int j = 0; j < (input_end3); j++)
+                for (int j = 0; j < (input_end3); j++)
                 {
-                    for(int k = 0; k < 16; k++)
+                    for (int k = 0; k < 16; k++)
                         *(col + (i + j) * kernel_size + p + k) = result[(j << 4) + k];
                 }
             }
 #else
             {
-                float* cur_input = (float* )(input +  i * in_ch);
-                float* cur_col = ( float* )(col + i * kernel_size + p);
+                float* cur_input = (float*)(input + i * in_ch);
+                float* cur_col = (float*)(col + i * kernel_size + p);
                 if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                     sgemm_4x12_deconv_a7(cur_input, cur_kernel, in_ch, cur_col, kernel_size);
                 else
@@ -328,14 +327,14 @@ static void sgemm_set(float* input, float* kernel, float* col, int in_ch, int in
             }
             {
                 float result[48];
-                float* cur_input = (float* )(input +  i * in_ch);
+                float* cur_input = (float*)(input + i * in_ch);
                 if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                     sgemm_4x12_deconv_a7(cur_input, cur_kernel, in_ch, result, 12);
                 else
                     sgemm_4x12_deconv_a17(cur_input, cur_kernel, in_ch, result, 12);
-                for(int j = 0; j < (input_end3); j++)
+                for (int j = 0; j < (input_end3); j++)
                 {
-                    for(int k = 0; k < 12; k++)
+                    for (int k = 0; k < 12; k++)
                         *(col + (i + j) * kernel_size + p + k) = result[j * 12 + k];
                 }
             }
@@ -344,22 +343,22 @@ static void sgemm_set(float* input, float* kernel, float* col, int in_ch, int in
     }
     else
     {
-        #pragma omp parallel for num_threads(num_thread)
-        for (int pp=0; pp<nn_kernel; pp++)
+#pragma omp parallel for num_threads(num_thread)
+        for (int pp = 0; pp < nn_kernel; pp++)
         {
             int p = kernel_start + pp * PER_OUT_CHAN;
 
-            float* cur_kernel = (float* )(kernel + p * in_ch);
+            float* cur_kernel = (float*)(kernel + p * in_ch);
 
             int i = 0;
-            for(; i + 3 < in_hw; i += 4)
+            for (; i + 3 < in_hw; i += 4)
             {
-                float* cur_input = (float* )(input +  i * in_ch);
-                float* cur_col = ( float* )(col + i * kernel_size + p);
+                float* cur_input = (float*)(input + i * in_ch);
+                float* cur_col = (float*)(col + i * kernel_size + p);
 #ifdef __aarch64__
                 if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                     sgemm_4x16_deconv_a53(cur_input, cur_kernel, in_ch, cur_col, kernel_size);
-                else    
+                else
                     sgemm_4x16_deconv_a72(cur_input, cur_kernel, in_ch, cur_col, kernel_size);
 #else
                 if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
@@ -373,7 +372,7 @@ static void sgemm_set(float* input, float* kernel, float* col, int in_ch, int in
 }
 
 static void sgemm4x4(float* input, float* kernel, float* col, int in_ch, int in_hw, int kernel_size,
-                      int kernel_start, int kernel_end, int num_thread, int cpu_affinity)
+                     int kernel_start, int kernel_end, int num_thread, int cpu_affinity)
 {
     float result[16];
     int input_line, kernel_num;
@@ -382,13 +381,13 @@ static void sgemm4x4(float* input, float* kernel, float* col, int in_ch, int in_
     int input_end3 = in_hw & 0x3;
     int kernel_end3 = kernel_end & 0x3;
 
-    for(kernel_num = kernel_start; kernel_num + 3 < (kernel_end & -4); kernel_num += 4)
+    for (kernel_num = kernel_start; kernel_num + 3 < (kernel_end & -4); kernel_num += 4)
     {
-        cur_kernel = ( float* )(kernel + kernel_num * in_ch);
-        for(input_line = 0; input_line < (in_hw & -4); input_line += 4)
+        cur_kernel = (float*)(kernel + kernel_num * in_ch);
+        for (input_line = 0; input_line < (in_hw & -4); input_line += 4)
         {
-            cur_input = ( float* )(input + input_line * in_ch);
-            cur_col = ( float* )(col + input_line * kernel_size + kernel_num);
+            cur_input = (float*)(input + input_line * in_ch);
+            cur_col = (float*)(col + input_line * kernel_size + kernel_num);
 #ifdef __aarch64__
             if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                 sgemm_4x4_deconv_a53(cur_input, cur_kernel, in_ch, cur_col, kernel_size);
@@ -401,31 +400,31 @@ static void sgemm4x4(float* input, float* kernel, float* col, int in_ch, int in_
                 sgemm_4x4_deconv_a17(cur_input, cur_kernel, in_ch, cur_col, kernel_size);
 #endif
         }
-        if(input_end3)
+        if (input_end3)
         {
-            cur_input = ( float* )(input + input_line * in_ch);
+            cur_input = (float*)(input + input_line * in_ch);
 #ifdef __aarch64__
-        if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
-            sgemm_4x4_deconv_a53(cur_input, cur_kernel, in_ch, result, 4);
-        else
-            sgemm_4x4_deconv_a72(cur_input, cur_kernel, in_ch, result, 4);
+            if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
+                sgemm_4x4_deconv_a53(cur_input, cur_kernel, in_ch, result, 4);
+            else
+                sgemm_4x4_deconv_a72(cur_input, cur_kernel, in_ch, result, 4);
 #else
             if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                 sgemm_4x4_deconv_a7(cur_input, cur_kernel, in_ch, result, 4);
             else
                 sgemm_4x4_deconv_a17(cur_input, cur_kernel, in_ch, result, 4);
 #endif
-            for(j = 0; j < (input_end3); j++)
-                for(i = 0; i < 4; i++)
+            for (j = 0; j < (input_end3); j++)
+                for (i = 0; i < 4; i++)
                     *(col + (input_line + j) * kernel_size + kernel_num + i) = result[(j << 2) + i];
         }
     }
-    if(kernel_end3)
+    if (kernel_end3)
     {
-        cur_kernel = ( float* )(kernel + kernel_num * in_ch);
-        for(input_line = 0; input_line < (in_hw & -4); input_line += 4)
+        cur_kernel = (float*)(kernel + kernel_num * in_ch);
+        for (input_line = 0; input_line < (in_hw & -4); input_line += 4)
         {
-            cur_input = ( float* )(input + input_line * in_ch);
+            cur_input = (float*)(input + input_line * in_ch);
 #ifdef __aarch64__
             if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                 sgemm_4x4_deconv_a53(cur_input, cur_kernel, in_ch, result, 4);
@@ -437,13 +436,13 @@ static void sgemm4x4(float* input, float* kernel, float* col, int in_ch, int in_
             else
                 sgemm_4x4_deconv_a17(cur_input, cur_kernel, in_ch, result, 4);
 #endif
-            for(j = 0; j < 4; j++)
-                for(i = 0; i < kernel_end3; i++)
+            for (j = 0; j < 4; j++)
+                for (i = 0; i < kernel_end3; i++)
                     *(col + (input_line + j) * kernel_size + kernel_num + i) = result[(j << 2) + i];
         }
-        if(input_end3)
+        if (input_end3)
         {
-            cur_input = ( float* )(input + input_line * in_ch);
+            cur_input = (float*)(input + input_line * in_ch);
 #ifdef __aarch64__
             if (cpu_affinity == TENGINE_CLUSTER_LITTLE)
                 sgemm_4x4_deconv_a53(cur_input, cur_kernel, in_ch, result, 4);
@@ -455,24 +454,24 @@ static void sgemm4x4(float* input, float* kernel, float* col, int in_ch, int in_
             else
                 sgemm_4x4_deconv_a17(cur_input, cur_kernel, in_ch, result, 4);
 #endif
-            for(j = 0; j < input_end3; j++)
-                for(i = 0; i < kernel_end3; i++)
+            for (j = 0; j < input_end3; j++)
+                for (i = 0; i < kernel_end3; i++)
                     *(col + (input_line + j) * kernel_size + kernel_num + i) = result[(j << 2) + i];
         }
     }
 }
 
-int deconv_hcl_prerun(struct tensor*  input_tensor , \
-                    struct tensor*  filter_tensor ,  \
-                    struct tensor*  output_tensor , \
-                    struct deconv_priv_info*  priv_info , \
-                    struct deconv_param* param)
+int deconv_hcl_prerun(struct tensor* input_tensor,
+                      struct tensor* filter_tensor,
+                      struct tensor* output_tensor,
+                      struct deconv_priv_info* priv_info,
+                      struct deconv_param* param)
 {
     int group = param->group;
     int kernel_h = param->kernel_h;
     int kernel_w = param->kernel_w;
-    int out_ch = output_tensor->dims[1]/group;
-    int in_ch = input_tensor->dims[1]/group;
+    int out_ch = output_tensor->dims[1] / group;
+    int in_ch = input_tensor->dims[1] / group;
     int in_h = input_tensor->dims[2];
     int in_w = input_tensor->dims[3];
 
@@ -491,7 +490,6 @@ int deconv_hcl_prerun(struct tensor*  input_tensor , \
         int col_size = sizeof(float) * in_h * in_w * kernel_size + 128;
         priv_info->col_buffer = (float*)sys_malloc(col_size);
         priv_info->col_buffer_size = col_size;
-        
     }
 
     interleave(filter_tensor, priv_info, param);
@@ -499,21 +497,21 @@ int deconv_hcl_prerun(struct tensor*  input_tensor , \
     return 0;
 }
 
-int deconv_hcl_postrun(struct deconv_priv_info*  priv_info)
+int deconv_hcl_postrun(struct deconv_priv_info* priv_info)
 {
-    if(priv_info->interleave_buffer != NULL)
+    if (priv_info->interleave_buffer != NULL)
     {
         sys_free(priv_info->interleave_buffer);
         priv_info->interleave_buffer = NULL;
     }
 
-    if(priv_info->trans_input_buffer != NULL)
+    if (priv_info->trans_input_buffer != NULL)
     {
         sys_free(priv_info->trans_input_buffer);
         priv_info->trans_input_buffer = NULL;
     }
 
-    if(priv_info->col_buffer != NULL)
+    if (priv_info->col_buffer != NULL)
     {
         sys_free(priv_info->col_buffer);
         priv_info->col_buffer = NULL;
@@ -522,14 +520,14 @@ int deconv_hcl_postrun(struct deconv_priv_info*  priv_info)
     return 0;
 }
 
-int deconv_hcl_run(struct tensor* input_tensor , \
-                    struct tensor* filter_tensor , \
-                    struct tensor* bias_tensor ,  \
-                    struct tensor* output_tensor , \
-                    struct deconv_priv_info* priv_info , \
-                    struct deconv_param* param, \
-                    int num_thread, \
-                    int cpu_affinity)
+int deconv_hcl_run(struct tensor* input_tensor,
+                   struct tensor* filter_tensor,
+                   struct tensor* bias_tensor,
+                   struct tensor* output_tensor,
+                   struct deconv_priv_info* priv_info,
+                   struct deconv_param* param,
+                   int num_thread,
+                   int cpu_affinity)
 {
     /* param */
     int group = param->group;
@@ -543,7 +541,7 @@ int deconv_hcl_run(struct tensor* input_tensor , \
     int in_c = input_tensor->dims[1] / group;
     int in_h = input_tensor->dims[2];
     int in_w = input_tensor->dims[3];
-	int in_hw = in_h * in_w;
+    int in_hw = in_h * in_w;
     int input_size = in_c * in_h * in_w;
 
     int out_c = output_tensor->dims[1] / group;
@@ -553,7 +551,7 @@ int deconv_hcl_run(struct tensor* input_tensor , \
     int output_size = out_c * out_h * out_w;
 
     int kernel_size = out_c * ksize * ksize;
-    int kernel_size_g = ((kernel_size + 3)&-4 ) * in_c;
+    int kernel_size_g = ((kernel_size + 3) & -4) * in_c;
 
     /* buffer addr */
     float* input_buf = (float*)input_tensor->data;
@@ -565,9 +563,9 @@ int deconv_hcl_run(struct tensor* input_tensor , \
 
     int sgemm_set_num = kernel_size / PER_OUT_CHAN * PER_OUT_CHAN;
     int sgemm_set_remain = kernel_size % PER_OUT_CHAN;
-    for(int n = 0; n < batch; n++) // batch size
+    for (int n = 0; n < batch; n++) // batch size
     {
-        for(int g = 0; g < group; g++)
+        for (int g = 0; g < group; g++)
         {
             /* im2col */
             float* cur_input = input_buf + (n * group + g) * input_size;
@@ -576,15 +574,14 @@ int deconv_hcl_run(struct tensor* input_tensor , \
             transpose_input(cur_input, trans_input_buf, in_hw, in_c);
 
             /* gemm */
-            sgemm_set(trans_input_buf,cur_kernel, col_buf, in_c, in_hw, kernel_size, 0, sgemm_set_num, num_thread, cpu_affinity);
-            if(sgemm_set_remain)
-            	sgemm4x4(trans_input_buf,cur_kernel, col_buf, in_c, in_hw, kernel_size, sgemm_set_num, kernel_size, num_thread, cpu_affinity);
-            float* cur_bias = biases_buf? (biases_buf + g * out_c) : NULL;
-			col2im(col_buf, cur_output, cur_bias, out_c, out_w, out_h, ksize, ksize, stride,
-                           stride, dilation, dilation, pad, pad, in_w, in_h);
+            sgemm_set(trans_input_buf, cur_kernel, col_buf, in_c, in_hw, kernel_size, 0, sgemm_set_num, num_thread, cpu_affinity);
+            if (sgemm_set_remain)
+                sgemm4x4(trans_input_buf, cur_kernel, col_buf, in_c, in_hw, kernel_size, sgemm_set_num, kernel_size, num_thread, cpu_affinity);
+            float* cur_bias = biases_buf ? (biases_buf + g * out_c) : NULL;
+            col2im(col_buf, cur_output, cur_bias, out_c, out_w, out_h, ksize, ksize, stride,
+                   stride, dilation, dilation, pad, pad, in_w, in_h);
         }
     }
 
-    return 0 ;
-
+    return 0;
 }
diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h
index e6b04b725..591aa718b 100644
--- a/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h
+++ b/source/device/cpu/op/deconv/cortex_a/deconv_kernel_arm.h
@@ -28,33 +28,31 @@
 
 #include "graph/tensor.h"
 
-
 struct deconv_priv_info
 {
-    float* interleave_buffer ;
+    float* interleave_buffer;
     int interleave_buffer_size;
-    float* col_buffer ; 
+    float* col_buffer;
     int col_buffer_size;
     float* trans_input_buffer;
     int trans_input_size;
 };
 
-int deconv_hcl_prerun(struct tensor*  input_tensor , \
-                    struct tensor*  filter_tensor ,  \
-                    struct tensor*  output_tensor , \
-                    struct deconv_priv_info* info ,      \
-                    struct deconv_param* param) ;
+int deconv_hcl_prerun(struct tensor* input_tensor,
+                      struct tensor* filter_tensor,
+                      struct tensor* output_tensor,
+                      struct deconv_priv_info* info,
+                      struct deconv_param* param);
 
 int deconv_hcl_postrun(struct deconv_priv_info* info);
 
-int deconv_hcl_run(struct tensor* input_tensor , \
-                 struct tensor* filter_tensor ,\
-                 struct tensor* bias_tensor ,  \
-                 struct tensor* output_tensor , \
-                 struct deconv_priv_info*  deconv_info , \
-                 struct deconv_param* param, \
-                 int num_thread, \
-                 int cpu_affinity) ;
-
+int deconv_hcl_run(struct tensor* input_tensor,
+                   struct tensor* filter_tensor,
+                   struct tensor* bias_tensor,
+                   struct tensor* output_tensor,
+                   struct deconv_priv_info* deconv_info,
+                   struct deconv_param* param,
+                   int num_thread,
+                   int cpu_affinity);
 
 #endif
diff --git a/source/device/cpu/op/deconv/deconv_ref.c b/source/device/cpu/op/deconv/deconv_ref.c
index 9919d7a81..7bdfa4b76 100644
--- a/source/device/cpu/op/deconv/deconv_ref.c
+++ b/source/device/cpu/op/deconv/deconv_ref.c
@@ -36,21 +36,20 @@
 
 #include <string.h>
 
-
 struct deconv_ref_param
 {
-    int in_shape[4];    // NCHW
-    int out_shape[3];    // CHW
-    int kernels[2];    // hw
-    int strides[2];    // hw
-    int dilations[2];    // hw
+    int in_shape[4];  // NCHW
+    int out_shape[3]; // CHW
+    int kernels[2];   // hw
+    int strides[2];   // hw
+    int dilations[2]; // hw
     int pads[2];
     int batch;
     int group;
     int activation;
     int layout;
     int zero[3];    // input, kernel, output
-    float scale[3];    // input, kernel, output
+    float scale[3]; // input, kernel, output
 };
 
 static inline float activation(float input, int activation)
@@ -102,7 +101,7 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne
     int kernel_offset = 0;
     int output_offset = 0;
 
-    memset(( void* )output, 0, (unsigned long)output_h * output_w * output_c * batch * group * sizeof(float));
+    memset((void*)output, 0, (unsigned long)output_h * output_w * output_c * batch * group * sizeof(float));
 
     for (n = 0; n < batch; ++n)
     {
@@ -118,13 +117,11 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne
                     {
                         if (param->layout == 0)
                         {
-                            input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w +
-                                           kc * input_h * input_w + h * input_w + w;
+                            input_offset = n * group * input_c * input_h * input_w + g * input_c * input_h * input_w + kc * input_h * input_w + h * input_w + w;
                         }
                         else
                         {
-                            input_offset = n * group * input_c * input_h * input_w + h * group * input_c * input_w +
-                                           w * group * input_c + g * input_c + kc;
+                            input_offset = n * group * input_c * input_h * input_w + h * group * input_c * input_w + w * group * input_c + g * input_c + kc;
                         }
                         input_val = input[input_offset];
                         for (c = 0; c < output_c; c++)
@@ -135,26 +132,18 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne
                                 {
                                     cur_out_x = org_out_x + k_w * dilation_w;
                                     cur_out_y = org_out_y + k_h * dilation_h;
-                                    if (cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >= 0 &&
-                                        cur_out_y < output_h)
+                                    if (cur_out_x >= 0 && cur_out_x < output_w && cur_out_y >= 0 && cur_out_y < output_h)
                                     {
                                         if (param->layout == 0)
                                         {
-                                            kernel_offset = g * output_c * input_c * kernel_h * kernel_w +
-                                                            kc * output_c * kernel_h * kernel_w +
-                                                            c * kernel_h * kernel_w + k_h * kernel_w + k_w;
+                                            kernel_offset = g * output_c * input_c * kernel_h * kernel_w + kc * output_c * kernel_h * kernel_w + c * kernel_h * kernel_w + k_h * kernel_w + k_w;
 
-                                            output_offset = n * group * output_c * output_w * output_h +
-                                                            g * output_c * output_w * output_h +
-                                                            c * output_w * output_h + cur_out_y * output_w + cur_out_x;
+                                            output_offset = n * group * output_c * output_w * output_h + g * output_c * output_w * output_h + c * output_w * output_h + cur_out_y * output_w + cur_out_x;
                                         }
                                         else
                                         {
-                                            kernel_offset = g * output_c * input_c * kernel_h * kernel_w +
-                                                            k_h * kernel_w * output_c + k_w * output_c + c;
-                                            output_offset = n * output_h * output_w * output_c * group +
-                                                            cur_out_y * group * output_w * output_c +
-                                                            cur_out_x * group * output_c + g * output_c + c;
+                                            kernel_offset = g * output_c * input_c * kernel_h * kernel_w + k_h * kernel_w * output_c + k_w * output_c + c;
+                                            output_offset = n * output_h * output_w * output_c * group + cur_out_y * group * output_w * output_c + cur_out_x * group * output_c + g * output_c + c;
                                         }
                                         weight_val = kernel[kernel_offset];
                                         output[output_offset] += weight_val * input_val;
@@ -182,14 +171,11 @@ static int ref_deconv_fp32(const float* input, float* output, const float* kerne
                         {
                             if (param->layout == 0)
                             {
-                                output_offset = n * output_c * group * output_w * output_h +
-                                                g * output_c * output_w * output_h + c * output_h * output_w +
-                                                h * output_w + w;
+                                output_offset = n * output_c * group * output_w * output_h + g * output_c * output_w * output_h + c * output_h * output_w + h * output_w + w;
                             }
                             else
                             {
-                                output_offset = n * output_c * group * output_w * output_h +
-                                                h * output_c * group * output_w + w * output_c * group + c;
+                                output_offset = n * output_c * group * output_w * output_h + h * output_c * group * output_w + w * output_c * group + c;
                             }
                             output[output_offset] += bias_val;
                         }
@@ -214,19 +200,19 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct graph* ir_graph = ir_node->graph;
     struct graph* graph = ir_node->graph;
 
-    struct deconv_param* param = ( struct deconv_param* )(ir_node->op.param_mem);
-    struct deconv_ref_param* op_param = ( struct deconv_ref_param* )exec_node->ops_priv;
+    struct deconv_param* param = (struct deconv_param*)(ir_node->op.param_mem);
+    struct deconv_ref_param* op_param = (struct deconv_ref_param*)exec_node->ops_priv;
 
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
 
-    if (graph->graph_layout == TENGINE_LAYOUT_NCHW)    // nchw
+    if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // nchw
     {
         op_param->batch = input_tensor->dims[0];
         op_param->in_shape[0] = input_tensor->dims[1];
         op_param->in_shape[1] = input_tensor->dims[2];
         op_param->in_shape[2] = input_tensor->dims[3];
     }
-    else    // nhwc
+    else // nhwc
     {
         op_param->batch = input_tensor->dims[0];
         op_param->in_shape[0] = input_tensor->dims[3];
@@ -238,12 +224,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 
     struct tensor* weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
 
-    if (graph->graph_layout == TENGINE_LAYOUT_NCHW)    // hw
+    if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // hw
     {
         op_param->kernels[0] = weight_tensor->dims[2];
         op_param->kernels[1] = weight_tensor->dims[3];
     }
-    else    //
+    else //
     {
         op_param->kernels[0] = weight_tensor->dims[1];
         op_param->kernels[1] = weight_tensor->dims[2];
@@ -253,7 +239,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (graph->graph_layout == TENGINE_LAYOUT_NCHW)    // chw
+    if (graph->graph_layout == TENGINE_LAYOUT_NCHW) // chw
     {
         op_param->out_shape[0] = output_tensor->dims[1];
         op_param->out_shape[1] = output_tensor->dims[2];
@@ -272,8 +258,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     op_param->dilations[1] = param->dilation_h;
     op_param->dilations[0] = param->dilation_w;
 
-    op_param->pads[0] = param->pad_h0;    // pad_h
-    op_param->pads[1] = param->pad_w0;    // pad_w
+    op_param->pads[0] = param->pad_h0; // pad_h
+    op_param->pads[1] = param->pad_w0; // pad_w
 
     op_param->group = param->group;
     op_param->activation = param->activation;
@@ -291,9 +277,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* bias_tensor = NULL;
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (ir_node->input_num > 2)	
+    if (ir_node->input_num > 2)
     {
-        bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);	
+        bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     }
 
     void* output_data = output_tensor->data;
@@ -304,7 +290,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (bias_tensor != NULL)
         bias = bias_tensor->data;
 
-    struct deconv_ref_param* op_param = ( struct deconv_ref_param* )exec_node->ops_priv;
+    struct deconv_ref_param* op_param = (struct deconv_ref_param*)exec_node->ops_priv;
 
     /* input quant param */
     int ret = ref_deconv_fp32((float*)input_data, (float*)output_data, (float*)kernel, (float*)bias, op_param);
@@ -324,7 +310,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct deconv_ref_param* deconv_ref_param = ( struct deconv_ref_param* )sys_malloc(sizeof(struct deconv_ref_param));
+    struct deconv_ref_param* deconv_ref_param = (struct deconv_ref_param*)sys_malloc(sizeof(struct deconv_ref_param));
     exec_node->ops_priv = deconv_ref_param;
     return 0;
 }
diff --git a/source/device/cpu/op/depthtospace/depthtospace_ref.c b/source/device/cpu/op/depthtospace/depthtospace_ref.c
index cd8e0610a..940b033ce 100644
--- a/source/device/cpu/op/depthtospace/depthtospace_ref.c
+++ b/source/device/cpu/op/depthtospace/depthtospace_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 int ref_depthtospace_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     float* input_data = (float*)input_tensor->data;
diff --git a/source/device/cpu/op/detection_output/detection_output_ref.c b/source/device/cpu/op/detection_output/detection_output_ref.c
index fe6cbde25..ed9409118 100644
--- a/source/device/cpu/op/detection_output/detection_output_ref.c
+++ b/source/device/cpu/op/detection_output/detection_output_ref.c
@@ -38,7 +38,6 @@
 #include <math.h>
 #include <string.h>
 
-
 typedef struct
 {
     float x0;
@@ -155,7 +154,7 @@ void nms_sorted_bboxes(const Box_t* bboxes, int bboxes_num, int* picked, int* pi
         }
     }
 
-	sys_free(areas);
+    sys_free(areas);
 }
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
@@ -170,29 +169,29 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct node* ir_node   = exec_node->ir_node;
+    struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct tensor* loc_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
+    struct tensor* loc_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* conf_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* priorbox_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    detection_output_param_t* param = ( detection_output_param_t* )(ir_node->op.param_mem);
+    detection_output_param_t* param = (detection_output_param_t*)(ir_node->op.param_mem);
 
-    float* location   = NULL;
+    float* location = NULL;
     float* confidence = NULL;
-    float* priorbox   = NULL;
+    float* priorbox = NULL;
 
     /* use original fp32 data or dequant uint8 to fp32 */
     if (loc_tensor->data_type == TENGINE_DT_FP32)
-        location = ( float* )loc_tensor->data;
+        location = (float*)loc_tensor->data;
     else if (loc_tensor->data_type == TENGINE_DT_UINT8)
     {
         uint8_t* location_u8 = (uint8_t*)loc_tensor->data;
-        uint32_t elem_num    = loc_tensor->elem_num;
-        uint32_t zero_point  = loc_tensor->zero_point;
+        uint32_t elem_num = loc_tensor->elem_num;
+        uint32_t zero_point = loc_tensor->zero_point;
         float scale = loc_tensor->scale;
         location = (float*)sys_malloc(elem_num * sizeof(float));
-        for (int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             location[i] = ((float)location_u8[i] - (float)zero_point) * scale;
         }
@@ -200,25 +199,25 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     else if (loc_tensor->data_type == TENGINE_DT_INT8)
     {
         int8_t* location_i8 = (int8_t*)loc_tensor->data;
-        uint32_t elem_num   = loc_tensor->elem_num;
+        uint32_t elem_num = loc_tensor->elem_num;
         float scale = loc_tensor->scale;
         location = (float*)sys_malloc(elem_num * sizeof(float));
-        for (int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             location[i] = (float)location_i8[i] * scale;
         }
     }
 
     if (conf_tensor->data_type == TENGINE_DT_FP32)
-        confidence = ( float* )conf_tensor->data;
+        confidence = (float*)conf_tensor->data;
     else if (conf_tensor->data_type == TENGINE_DT_UINT8)
     {
         uint8_t* confidence_u8 = (uint8_t*)conf_tensor->data;
-        uint32_t elem_num      = conf_tensor->elem_num;
-        uint32_t zero_point    = conf_tensor->zero_point;
+        uint32_t elem_num = conf_tensor->elem_num;
+        uint32_t zero_point = conf_tensor->zero_point;
         float scale = conf_tensor->scale;
         confidence = (float*)sys_malloc(elem_num * sizeof(float));
-        for (int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             confidence[i] = ((float)confidence_u8[i] - (float)zero_point) * scale;
         }
@@ -226,25 +225,25 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     else if (conf_tensor->data_type == TENGINE_DT_INT8)
     {
         int8_t* confidence_i8 = (int8_t*)conf_tensor->data;
-        uint32_t elem_num     = conf_tensor->elem_num;
+        uint32_t elem_num = conf_tensor->elem_num;
         float scale = conf_tensor->scale;
         confidence = (float*)sys_malloc(elem_num * sizeof(float));
-        for (int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             confidence[i] = (float)confidence_i8[i] * scale;
         }
     }
 
     if (priorbox_tensor->data_type == TENGINE_DT_FP32)
-        priorbox = ( float* )priorbox_tensor->data;
+        priorbox = (float*)priorbox_tensor->data;
     else if (priorbox_tensor->data_type == TENGINE_DT_UINT8)
     {
         uint8_t* priorbox_u8 = (uint8_t*)priorbox_tensor->data;
-        uint32_t elem_num    = priorbox_tensor->elem_num;
-        uint32_t zero_point  = priorbox_tensor->zero_point;
+        uint32_t elem_num = priorbox_tensor->elem_num;
+        uint32_t zero_point = priorbox_tensor->zero_point;
         float scale = priorbox_tensor->scale;
         priorbox = (float*)sys_malloc(elem_num * sizeof(float));
-        for (int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             priorbox[i] = ((float)priorbox_u8[i] - (float)zero_point) * scale;
         }
@@ -252,22 +251,22 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     else if (priorbox_tensor->data_type == TENGINE_DT_INT8)
     {
         int8_t* priorbox_i8 = (int8_t*)priorbox_tensor->data;
-        uint32_t elem_num   = priorbox_tensor->elem_num;
+        uint32_t elem_num = priorbox_tensor->elem_num;
         float scale = priorbox_tensor->scale;
         priorbox = (float*)sys_malloc(elem_num * sizeof(float));
-        for (int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             priorbox[i] = (float)priorbox_i8[i] * scale;
         }
     }
 
     const int num_priorx4 = priorbox_tensor->dims[2];
-    const int num_prior   = num_priorx4 / 4;
+    const int num_prior = num_priorx4 / 4;
     const int num_classes = param->num_classes;
 
     int b = 0;
-    float* loc_ptr   = location + b * num_priorx4;
-    float* conf_ptr  = confidence + b * num_prior * num_classes;
+    float* loc_ptr = location + b * num_priorx4;
+    float* conf_ptr = confidence + b * num_prior * num_classes;
     float* prior_ptr = priorbox + b * num_priorx4 * 2;
 
     Box_t* boxes = (Box_t*)sys_malloc(sizeof(Box_t) * num_prior);
@@ -294,7 +293,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         if (class_box_num > param->nms_top_k)
             class_box_num = param->nms_top_k;
 
-        int* picked = (int*)sys_malloc(sizeof(int) * class_box_num);    // = NULL;
+        int* picked = (int*)sys_malloc(sizeof(int) * class_box_num); // = NULL;
         int picked_num = 0;
         nms_sorted_bboxes(class_box, class_box_num, picked, &picked_num, param->nms_threshold);
 
@@ -304,14 +303,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
             push_vector_data(output_bbox_v, &class_box[z]);
         }
 
-		sys_free(picked);
-		sys_free(class_box);
+        sys_free(picked);
+        sys_free(class_box);
     }
 
-	sys_free(boxes);
+    sys_free(boxes);
 
     int total_num = get_vector_num(output_bbox_v);
-    Box_t* bbox_rects = ( Box_t* )sys_malloc(total_num * sizeof(Box_t));
+    Box_t* bbox_rects = (Box_t*)sys_malloc(total_num * sizeof(Box_t));
 
     for (int i = 0; i < total_num; i++)
         memcpy(&bbox_rects[i], get_vector_data(output_bbox_v, i), sizeof(Box_t));
@@ -328,10 +327,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     // output
     float* output_fp32 = NULL;
     if (output_tensor->data_type == TENGINE_DT_FP32)
-        output_fp32 = ( float* )output_tensor->data;
+        output_fp32 = (float*)output_tensor->data;
     else
     {
-        output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float ));
+        output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float));
     }
 
     for (int i = 0; i < num_detected; i++)
@@ -355,7 +354,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         uint32_t elem_num = output_tensor->elem_num;
         float scale = output_tensor->scale;
         uint32_t zero_point = output_tensor->zero_point;
-        for(int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             int udata = (int)(output_fp32[i] / scale + zero_point);
             if (udata > 255)
@@ -377,7 +376,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         int8_t* output_i8 = (int8_t*)output_tensor->data;
         int32_t elem_num = output_tensor->elem_num;
         float scale = output_tensor->scale;
-        for(int i=0; i<elem_num; i++)
+        for (int i = 0; i < elem_num; i++)
         {
             int data_i32 = round(output_fp32[i] / scale);
             if (data_i32 > 127)
diff --git a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c
index 9d7185cff..25b14171a 100644
--- a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c
+++ b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c
@@ -39,13 +39,12 @@
 #include <math.h>
 #include <string.h>
 
-
 struct Dpp_Box
 {
-    float x0;    // xmin
-    float y0;    // ymin
-    float x1;    // xmax
-    float y1;    // ymax
+    float x0; // xmin
+    float y0; // ymin
+    float x1; // xmax
+    float y1; // ymax
     int box_idx;
     int class_idx;
     float score;
@@ -68,7 +67,7 @@ struct dpp_param
 #define DPP_MAX(a, b) (a > b ? a : b)
 static float intersection_area(const struct Dpp_Box a, const struct Dpp_Box b)
 {
-    if(a.x0 > b.x1 || a.x1 < b.x0 || a.y0 > b.y1 || a.y1 < b.y0)
+    if (a.x0 > b.x1 || a.x1 < b.x0 || a.y0 > b.y1 || a.y1 < b.y0)
     {
         // no intersection
         return 0.f;
@@ -77,15 +76,15 @@ static float intersection_area(const struct Dpp_Box a, const struct Dpp_Box b)
     float inter_width = DPP_MIN(a.x1, b.x1) - DPP_MAX(a.x0, b.x0);
     float inter_height = DPP_MIN(a.y1, b.y1) - DPP_MAX(a.y0, b.y0);
 
-    return inter_width* inter_height;
+    return inter_width * inter_height;
 }
 
 static void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int* picked, int* picked_size,
-                                     float nms_threshold)
+                              float nms_threshold)
 {
     float* areas = (float*)sys_malloc(sizeof(float) * boxes_size);
     int n_picked = 0;
-    for(int i = 0; i < boxes_size; i++)
+    for (int i = 0; i < boxes_size; i++)
     {
         float width = boxes[i].x1 - boxes[i].x0;
         float height = boxes[i].y1 - boxes[i].y0;
@@ -93,20 +92,20 @@ static void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int*
         areas[i] = width * height;
     }
 
-    for(int i = 0; i < boxes_size; i++)
+    for (int i = 0; i < boxes_size; i++)
     {
         int keep = 1;
-        for(int j = 0; j < n_picked; j++)
+        for (int j = 0; j < n_picked; j++)
         {
             // intersection over union
             float inter_area = intersection_area(boxes[i], boxes[picked[j]]);
             float union_area = areas[i] + areas[picked[j]] - inter_area;
             // float IoU = inter_area / union_area
-            if(inter_area / union_area > nms_threshold)
+            if (inter_area / union_area > nms_threshold)
                 keep = 0;
         }
 
-        if(keep)
+        if (keep)
         {
             picked[n_picked] = i;
             n_picked++;
@@ -120,17 +119,17 @@ static void nms_sorted_bboxes(const struct Dpp_Box* boxes, int boxes_size, int*
 static void sort_boxes_by_score(struct Dpp_Box* boxes, int size)
 {
     int i, j;
-    for(i = 0; i < size - 1; i++)
+    for (i = 0; i < size - 1; i++)
     {
         int max_idx = i;
-        for(j = i + 1; j < size; j++)
+        for (j = i + 1; j < size; j++)
         {
-            if(boxes[j].score < 0.6)
+            if (boxes[j].score < 0.6)
                 continue;
-            if(boxes[max_idx].score < boxes[j].score)
+            if (boxes[max_idx].score < boxes[j].score)
                 max_idx = j;
         }
-        if(i != max_idx)
+        if (i != max_idx)
         {
             struct Dpp_Box tmp;
             memcpy(&tmp, boxes + i, sizeof(struct Dpp_Box));
@@ -139,14 +138,14 @@ static void sort_boxes_by_score(struct Dpp_Box* boxes, int size)
         }
         else
         {
-            if(boxes[max_idx].score < 0.6)
+            if (boxes[max_idx].score < 0.6)
                 return;
         }
     }
 }
 
 static int decode_single_box(struct Dpp_Box* box, const float* box_ptr, const float* anchor_ptr,
-                                    const float* scales)
+                             const float* scales)
 {
     int i = box->box_idx;
 
@@ -163,7 +162,7 @@ static int decode_single_box(struct Dpp_Box* box, const float* box_ptr, const fl
     box->x0 = xcenter - half_w;
     box->y1 = ycenter + half_h;
     box->x1 = xcenter + half_w;
-    if(box->y0 < 0 || box->x0 < 0)
+    if (box->y0 < 0 || box->x0 < 0)
         return -1;
     return 0;
 }
@@ -172,20 +171,20 @@ void get_all_boxes_rect(struct Dpp_Box* all_class_bbox_rects, const float* box,
                         const float* anchor, int num_boxes, int num_classes, float* scales)
 {
     struct Dpp_Box selected_box;
-    for(int j = 0; j < num_boxes; j++)
+    for (int j = 0; j < num_boxes; j++)
     {
-        for(int i = 1; i < num_classes; i++)
+        for (int i = 1; i < num_classes; i++)
         {
             float score = scores[j * num_classes + i];
 
-            if(score < 0.6)
+            if (score < 0.6)
                 continue;
 
             selected_box.score = score;
             selected_box.class_idx = i;
             selected_box.box_idx = j;
 
-            if(decode_single_box(&selected_box, box, anchor, scales) < 0)
+            if (decode_single_box(&selected_box, box, anchor, scales) < 0)
                 continue;
 
             // struct Box* cls_vector = all_class_bbox_rects[i];
@@ -195,39 +194,39 @@ void get_all_boxes_rect(struct Dpp_Box* all_class_bbox_rects, const float* box,
 }
 
 int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor_f,
-                   float* detect_num, float* detect_class, float* detect_score, float* detect_boxes,struct dpp_param* param)
+                 float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, struct dpp_param* param)
 {
     const int num_classes = param->num_classes + 1;
     const int num_boxes = param->num_boxes;
     const int max_detections = param->max_detections;
 
-    struct Dpp_Box* all_boxes = ( struct Dpp_Box* )malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box));
+    struct Dpp_Box* all_boxes = (struct Dpp_Box*)malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box));
     memset(all_boxes, 0, sizeof(struct Dpp_Box) * num_classes * num_boxes);
 
     get_all_boxes_rect(all_boxes, input_f, score_f, anchor_f, num_boxes, num_classes, param->scales);
 
     int max_picked_boxes = 2 * max_detections * num_classes;
-    struct Dpp_Box* picked_boxes = ( struct Dpp_Box* )malloc(max_picked_boxes * sizeof(struct Dpp_Box));
+    struct Dpp_Box* picked_boxes = (struct Dpp_Box*)malloc(max_picked_boxes * sizeof(struct Dpp_Box));
     memset(picked_boxes, 0, sizeof(struct Dpp_Box) * max_picked_boxes);
     int all_picked_size = 0;
 
-    for(int i = 1; i < num_classes; i++)
+    for (int i = 1; i < num_classes; i++)
     {
         struct Dpp_Box* class_box = all_boxes + i * num_boxes;
 
         // sort
         sort_boxes_by_score(class_box, num_boxes);
         int box_size = 0;
-        for(int j = 0; j < num_boxes; j++)
+        for (int j = 0; j < num_boxes; j++)
         {
-            if(class_box[j].score < 0.6)
+            if (class_box[j].score < 0.6)
                 break;
             box_size++;
         }
-        if(box_size == 0)
+        if (box_size == 0)
             continue;
 
-        if(box_size > max_detections * 2)
+        if (box_size > max_detections * 2)
             box_size = max_detections * 2;
 
         int* picked = (int*)sys_malloc(sizeof(int) * num_boxes);
@@ -237,7 +236,7 @@ int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor
         nms_sorted_bboxes(class_box, box_size, picked, &picked_size, param->nms_iou_threshold);
 
         // save the survivors
-        for(int j = 0; j < picked_size; j++)
+        for (int j = 0; j < picked_size; j++)
         {
             int z = picked[j];
             memcpy(picked_boxes + all_picked_size, class_box + z, sizeof(struct Dpp_Box));
@@ -248,13 +247,13 @@ int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor
     }
 
     sort_boxes_by_score(picked_boxes, max_picked_boxes);
-    if(all_picked_size > max_detections)
+    if (all_picked_size > max_detections)
         all_picked_size = max_detections;
 
     // generate output tensors
     detect_num[0] = all_picked_size;
 
-    for(int i = 0; i < all_picked_size; i++)
+    for (int i = 0; i < all_picked_size; i++)
     {
         detect_class[i] = picked_boxes[i].class_idx;
         detect_score[i] = picked_boxes[i].score;
@@ -271,7 +270,7 @@ int ref_dpp_fp32(const float* input_f, const float* score_f, const float* anchor
 }
 
 int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anchor,
-                 float* detect_num, float* detect_class, float* detect_score, float* detect_boxes,struct dpp_param* param)
+                  float* detect_num, float* detect_class, float* detect_score, float* detect_boxes, struct dpp_param* param)
 {
     const int num_classes = param->num_classes + 1;
     const int num_boxes = param->num_boxes;
@@ -280,43 +279,43 @@ int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anc
     /* transform uint8_t to fp32 */
     int input_size = num_boxes * 4;
     int score_size = num_boxes * num_classes;
-    float* input_f = (float* )malloc(input_size * sizeof(float));
-    float* score_f = (float* )malloc(score_size * sizeof(float));
-    float* anchor_f = (float* )malloc(input_size * sizeof(float));
-    for(int i=0; i<input_size; i++)
+    float* input_f = (float*)malloc(input_size * sizeof(float));
+    float* score_f = (float*)malloc(score_size * sizeof(float));
+    float* anchor_f = (float*)malloc(input_size * sizeof(float));
+    for (int i = 0; i < input_size; i++)
         input_f[i] = (input[i] - param->zero[0]) * param->quant_scale[0];
-    for(int i=0; i<score_size; i++)
+    for (int i = 0; i < score_size; i++)
         score_f[i] = score[i] * param->quant_scale[1];
-    for(int i=0; i<input_size; i++)
+    for (int i = 0; i < input_size; i++)
         anchor_f[i] = (anchor[i] - param->zero[2]) * param->quant_scale[2];
 
-    struct Dpp_Box* all_boxes = (struct Dpp_Box* )malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box));
+    struct Dpp_Box* all_boxes = (struct Dpp_Box*)malloc((unsigned long)num_classes * num_boxes * sizeof(struct Dpp_Box));
     memset(all_boxes, 0, sizeof(struct Dpp_Box) * num_classes * num_boxes);
 
     get_all_boxes_rect(all_boxes, input_f, score_f, anchor_f, num_boxes, num_classes, param->scales);
 
     int max_picked_boxes = 2 * max_detections * num_classes;
-    struct Dpp_Box* picked_boxes = ( struct Dpp_Box* )malloc(max_picked_boxes * sizeof(struct Dpp_Box));
+    struct Dpp_Box* picked_boxes = (struct Dpp_Box*)malloc(max_picked_boxes * sizeof(struct Dpp_Box));
     memset(picked_boxes, 0, sizeof(struct Dpp_Box) * max_picked_boxes);
     int all_picked_size = 0;
 
-    for(int i = 1; i < num_classes; i++)
+    for (int i = 1; i < num_classes; i++)
     {
         struct Dpp_Box* class_box = all_boxes + i * num_boxes;
 
         // sort
         sort_boxes_by_score(class_box, num_boxes);
         int box_size = 0;
-        for(int j = 0; j < num_boxes; j++)
+        for (int j = 0; j < num_boxes; j++)
         {
-            if(class_box[j].score < 0.6)
+            if (class_box[j].score < 0.6)
                 break;
             box_size++;
         }
-        if(box_size == 0)
+        if (box_size == 0)
             continue;
 
-        if(box_size > max_detections * 2)
+        if (box_size > max_detections * 2)
             box_size = max_detections * 2;
 
         int* picked = (int*)sys_malloc(sizeof(int) * num_boxes);
@@ -326,7 +325,7 @@ int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anc
         nms_sorted_bboxes(class_box, box_size, picked, &picked_size, param->nms_iou_threshold);
 
         // save the survivors
-        for(int j = 0; j < picked_size; j++)
+        for (int j = 0; j < picked_size; j++)
         {
             int z = picked[j];
             memcpy(picked_boxes + all_picked_size, class_box + z, sizeof(struct Dpp_Box));
@@ -337,13 +336,13 @@ int ref_dpp_uint8(const uint8_t* input, const uint8_t* score, const uint8_t* anc
     }
 
     sort_boxes_by_score(picked_boxes, max_picked_boxes);
-    if(all_picked_size > max_detections)
+    if (all_picked_size > max_detections)
         all_picked_size = max_detections;
 
     // generate output tensors
     detect_num[0] = all_picked_size;
 
-    for(int i = 0; i < all_picked_size; i++)
+    for (int i = 0; i < all_picked_size; i++)
     {
         detect_class[i] = picked_boxes[i].class_idx;
         detect_score[i] = picked_boxes[i].score;
@@ -373,7 +372,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct detection_postprocess_param* param_ = (struct detection_postprocess_param* )ir_node->op.param_mem;
+    struct detection_postprocess_param* param_ = (struct detection_postprocess_param*)ir_node->op.param_mem;
 
     param.max_classes_per_detection = param_->max_classes_per_detection;
     param.nms_iou_threshold = param_->nms_iou_threshold;
@@ -386,8 +385,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     param.scales[2] = param_->scales[2];
     param.scales[3] = param_->scales[3];
 
-    if(input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_FP16 &&
-       input_tensor->data_type != TENGINE_DT_UINT8)
+    if (input_tensor->data_type != TENGINE_DT_FP32 && input_tensor->data_type != TENGINE_DT_FP16 && input_tensor->data_type != TENGINE_DT_UINT8)
     {
         TLOG_ERR("Not support!");
         return -1;
@@ -401,7 +399,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
 
-    struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param* )ir_node->op.param_mem;
+    struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)ir_node->op.param_mem;
 
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     const void* input_data = input_tensor->data;
@@ -433,11 +431,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (input_tensor->dim_num == 3 && input_tensor->elem_size == 1)
     {
         int in_ch = input_tensor->dims[1];
-        int in_w  = input_tensor->dims[2];
+        int in_w = input_tensor->dims[2];
         int in_size = input_tensor->elem_num;
 
         int score_ch = score->dims[1];
-        int score_w  = score->dims[2];
+        int score_w = score->dims[2];
         int score_size = score->elem_num;
 
         uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
@@ -449,13 +447,13 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         memcpy(score_uint8_temp, score_uint8, score_size);
 
         int index = 0;
-        for(int w = 0; w < in_w; w++)
-            for(int c = 0; c < in_ch; c++)
+        for (int w = 0; w < in_w; w++)
+            for (int c = 0; c < in_ch; c++)
                 input_uint8[index++] = input_uint8_temp[c * in_w + w];
 
         index = 0;
-        for(int w = 0; w < score_w; w++)
-            for(int c = 0; c < score_ch; c++)
+        for (int w = 0; w < score_w; w++)
+            for (int c = 0; c < score_ch; c++)
                 score_uint8[index++] = score_uint8_temp[c * score_w + w];
 
         free(input_uint8_temp);
@@ -464,29 +462,29 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     else
     {
         int in_ch = input_tensor->dims[1];
-        int in_w  = input_tensor->dims[2];
+        int in_w = input_tensor->dims[2];
         int in_size = input_tensor->elem_num;
 
         int score_ch = score->dims[1];
-        int score_w  = score->dims[2];
+        int score_w = score->dims[2];
         int score_size = score->elem_num;
 
         float* input_fp32 = (float*)input_tensor->data;
         float* score_fp32 = (float*)score->data;
-        float* input_fp32_temp = (float*)malloc(in_size*sizeof(float));
-        float* score_fp32_temp = (float*)malloc(score_size*sizeof(float));
+        float* input_fp32_temp = (float*)malloc(in_size * sizeof(float));
+        float* score_fp32_temp = (float*)malloc(score_size * sizeof(float));
 
         memcpy(input_fp32_temp, input_fp32, in_size);
         memcpy(score_fp32_temp, score_fp32, score_size);
 
         int index = 0;
-        for(int w = 0; w < in_w; w++)
-            for(int c = 0; c < in_ch; c++)
+        for (int w = 0; w < in_w; w++)
+            for (int c = 0; c < in_ch; c++)
                 input_fp32[index++] = input_fp32_temp[c * in_w + w];
 
         index = 0;
-        for(int w = 0; w < score_w; w++)
-            for(int c = 0; c < score_ch; c++)
+        for (int w = 0; w < score_w; w++)
+            for (int c = 0; c < score_ch; c++)
                 score_fp32[index++] = score_fp32_temp[c * score_w + w];
 
         free(input_fp32_temp);
@@ -494,11 +492,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     }
 
     if (input_tensor->data_type == TENGINE_DT_FP32)
-        ref_dpp_fp32((float*)input_data, (float*)score_data, (float*)anchor_data, detect_num_data, 
-            detect_classes_data, detect_scores_data, detect_boxes_data, &param);
+        ref_dpp_fp32((float*)input_data, (float*)score_data, (float*)anchor_data, detect_num_data,
+                     detect_classes_data, detect_scores_data, detect_boxes_data, &param);
     else
-        ref_dpp_uint8((uint8_t*)input_data, (uint8_t*)score_data, (uint8_t*)anchor_data, detect_num_data, 
-            detect_classes_data, detect_scores_data, detect_boxes_data, &param);
+        ref_dpp_uint8((uint8_t*)input_data, (uint8_t*)score_data, (uint8_t*)anchor_data, detect_num_data,
+                      detect_classes_data, detect_scores_data, detect_boxes_data, &param);
 
     return 0;
 }
@@ -518,12 +516,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     return OPS_SCORE_CANDO;
 }
 static struct node_ops detection_postprocess_node_ops = {.prerun = prerun,
-                                       .run = run,
-                                       .reshape = NULL,
-                                       .postrun = NULL,
-                                       .init_node = init_node,
-                                       .release_node = release_node,
-                                       .score = score};
+                                                         .run = run,
+                                                         .reshape = NULL,
+                                                         .postrun = NULL,
+                                                         .init_node = init_node,
+                                                         .release_node = release_node,
+                                                         .score = score};
 
 int register_detection_postprocess_ref_op()
 {
diff --git a/source/device/cpu/op/dropout/dropout_ref.c b/source/device/cpu/op/dropout/dropout_ref.c
index dd6e32075..144663971 100644
--- a/source/device/cpu/op/dropout/dropout_ref.c
+++ b/source/device/cpu/op/dropout/dropout_ref.c
@@ -32,7 +32,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     exec_node->inplace_map[0] = 0;
diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c
index 99c2b3c33..01f57de9f 100644
--- a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c
+++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -54,7 +53,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor0 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem;
+    struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem;
 
     struct tensor* input_tensor1 = NULL;
     if (ir_node->input_num > 1)
@@ -79,7 +78,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 
     input_tensor_0 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     input_tensor_1 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
-    struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem;
+    struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem;
 
     if (input_tensor_0->data_type != TENGINE_DT_FP32 || ir_graph->graph_layout != TENGINE_LAYOUT_NCHW)
         return 0;
diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h
index 95ba897ef..ac2dba2e2 100644
--- a/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h
+++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_hcl_arm.h
@@ -31,7 +31,6 @@
 
 #include <arm_neon.h>
 
-
 int perf_eltwise_fp32(struct tensor* output_tensor, struct tensor* input_tensor0, struct tensor* input_tensor1,
                       struct eltwise_param* eltwise_param, int num_thread)
 {
diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c
index 13620556d..8e341742f 100644
--- a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c
+++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.c
@@ -29,7 +29,6 @@
 
 #include <arm_neon.h>
 
-
 static int kernel_run(float* output, float* input0, float* input1, int type, int in_size0, int in_size1, int stride)
 {
     float* out_ptr = output;
@@ -39,51 +38,51 @@ static int kernel_run(float* output, float* input0, float* input1, int type, int
 
     switch (type)
     {
-        case ELT_SUM:
-            if (in_size0 == 1)
+    case ELT_SUM:
+        if (in_size0 == 1)
+        {
+            float32x4_t data0 = vdupq_n_f32(in0[0]);
+            for (int i = 0; i < in_size1; i = i + 4)
+            {
+                float32x4_t data1 = vld1q_f32(in1 + i);
+                float32x4_t sum = vaddq_f32(data0, data1);
+                vst1q_f32(out_ptr + i, sum);
+            }
+            loop_time = in_size1 / 4;
+            for (int i = loop_time * 4; i < in_size1; i++)
             {
-                float32x4_t data0 = vdupq_n_f32(in0[0]);
-                for (int i = 0; i < in_size1; i = i + 4)
-                {
-                    float32x4_t data1 = vld1q_f32(in1 + i);
-                    float32x4_t sum = vaddq_f32(data0, data1);
-                    vst1q_f32(out_ptr + i, sum);
-                }
-                loop_time = in_size1 / 4;
-                for (int i = loop_time * 4; i < in_size1; i++)
-                {
-                    out_ptr[i] = in1[i] + in0[0];
-                }
+                out_ptr[i] = in1[i] + in0[0];
             }
-            else if (in_size1 == in_size0)
+        }
+        else if (in_size1 == in_size0)
+        {
+            for (int i = 0; i < in_size1; i = i + 4)
+            {
+                float32x4_t data0 = vld1q_f32(in0 + i);
+                float32x4_t data1 = vld1q_f32(in1 + i);
+                float32x4_t sum = vaddq_f32(data0, data1);
+                vst1q_f32(out_ptr + i, sum);
+            }
+            loop_time = in_size1 / 4;
+
+            for (int i = loop_time * 4; i < in_size1; i++)
             {
-                for (int i = 0; i < in_size1; i = i + 4)
-                {
-                    float32x4_t data0 = vld1q_f32(in0 + i);
-                    float32x4_t data1 = vld1q_f32(in1 + i);
-                    float32x4_t sum = vaddq_f32(data0, data1);
-                    vst1q_f32(out_ptr + i, sum);
-                }
-                loop_time = in_size1 / 4;
-
-                for (int i = loop_time * 4; i < in_size1; i++)
-                {
-                    out_ptr[i] = in1[i] + in0[i];
-                }
+                out_ptr[i] = in1[i] + in0[i];
             }
-            else if (in_size0 < in_size1 && in_size0 != 1)
+        }
+        else if (in_size0 < in_size1 && in_size0 != 1)
+        {
+            for (int i = 0; i < in_size1; ++i)
             {
-                for (int i = 0; i < in_size1; ++i)
-                {
-                    *out_ptr++ = in1[i] + in0[i / stride];
-                }
+                *out_ptr++ = in1[i] + in0[i / stride];
             }
-            else
-                return -1;
-            break;
+        }
+        else
+            return -1;
+        break;
 
-        default:
-            break;
+    default:
+        break;
     }
 
     return 0;
@@ -93,7 +92,7 @@ int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, stru
                 struct eltwise_param* eltwise_param, int num_thread)
 {
     // input
-    float* input0 = ( float* )input_tensor0->data;
+    float* input0 = (float*)input_tensor0->data;
     int in_size0 = input_tensor0->elem_num;
 
     float* input1 = NULL;
@@ -101,7 +100,7 @@ int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, stru
 
     if (input_tensor1)
     {
-        input1 = ( float* )input_tensor1->data;
+        input1 = (float*)input_tensor1->data;
         in_size1 = input_tensor1->elem_num;
     }
 
@@ -133,7 +132,7 @@ int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, stru
     // int input_number=node->GetInputNum();
 
     // output
-    float* output = ( float* )output_tensor->data;
+    float* output = (float*)output_tensor->data;
     int result = 0;
 
     int stride = input_tensor_tmp->dims[2] * input_tensor_tmp->dims[3];
diff --git a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h
index ab78cc214..3bb1e3f05 100644
--- a/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h
+++ b/source/device/cpu/op/eltwise/cortex-a/eltwise_kernel_arm.h
@@ -28,7 +28,6 @@
 
 #include "graph/tensor.h"
 
-
 int eltwise_run(struct tensor* output_tensor, struct tensor* input_tensor0, struct tensor* input_tensor1,
                 struct eltwise_param* eltwise_param, int num_thread);
 
diff --git a/source/device/cpu/op/eltwise/eltwise_ref.c b/source/device/cpu/op/eltwise/eltwise_ref.c
index 0b6f14bf2..58c901056 100644
--- a/source/device/cpu/op/eltwise/eltwise_ref.c
+++ b/source/device/cpu/op/eltwise/eltwise_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 struct eltwise_op_param
 {
     float scale[3];
@@ -49,214 +48,234 @@ struct eltwise_op_param
 static int ref_eltwise_fp32(void* output, void* input0, void* input1, int type, int input_count4, int input_chan,
                             int input_hw, int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param)
 {
-    float* out_ptr = ( float* )output;
-    float* in0 = ( float* )input0;
-    float* in1 = ( float* )input1;
+    float* out_ptr = (float*)output;
+    float* in0 = (float*)input0;
+    float* in1 = (float*)input1;
 
     switch (type)
     {
-        case ELT_SUB:
-            if (input1_count4 == 1)
-            {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = (*in0++) - in1[0];
-                }
-            }
-            else if (input_count4 == input1_count4)
+    case ELT_SUB:
+        if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = (*in0++) - (*in1++);
-                }
+                *out_ptr++ = (*in0++) - in1[0];
             }
-            else if (input_chan == input1_count4)
+        }
+        else if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = in0[i] - in1[i / input_hw];
-                }
+                *out_ptr++ = (*in0++) - (*in1++);
             }
-            else
-                return -1;
-            break;
-        case ELT_SUM:
-            if (input1_count4 == 1)
+        }
+        else if (input_chan == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = (*in0++) + in1[0];
-                }
+                *out_ptr++ = in0[i] - in1[i / input_hw];
             }
-            else if (input_count4 == input1_count4)
+        }
+        else
+            return -1;
+        break;
+    case ELT_SUM:
+        if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = (*in0++) + (*in1++);
-                }
+                *out_ptr++ = (*in0++) + in1[0];
             }
-            else if (input_chan == input1_count4)
+        }
+        else if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = in0[i] + in1[i / input_hw];
-                }
+                *out_ptr++ = (*in0++) + (*in1++);
             }
-            else if(input_hw == input_hw_1){
-                for( int i = 0; i < input_chan; i++){
-                    for(int j = 0; j < input_hw; j++){
-                        *out_ptr++ = in0[i*input_hw + j] + in1[j];
-                    }
-                }
-                // TLOG_ERR("%d %d \n", input1_count4, input_chan);
-            }
-            else
-                return -1;
-            break;
-        case ELT_MAX:
+        }
+        else if (input_chan == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                *out_ptr++ = ELT_MAX(in0[i], in1[i]);
+                *out_ptr++ = in0[i] + in1[i / input_hw];
             }
-            break;
-        case ELT_PROD:
-            if (input1_count4 == 1)
+        }
+        else if (input_hw == input_hw_1)
+        {
+            for (int i = 0; i < input_chan; i++)
             {
-                for (int i = 0; i < input_count4; ++i)
+                for (int j = 0; j < input_hw; j++)
                 {
-                    *out_ptr++ = (*in0++) * in1[0];
+                    *out_ptr++ = in0[i * input_hw + j] + in1[j];
                 }
             }
-            else if (input_count4 == input1_count4)
+            // TLOG_ERR("%d %d \n", input1_count4, input_chan);
+        }
+        else
+            return -1;
+        break;
+    case ELT_MAX:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            *out_ptr++ = ELT_MAX(in0[i], in1[i]);
+        }
+        break;
+    case ELT_PROD:
+        if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = in0[i] * in1[i];
-                }
+                *out_ptr++ = (*in0++) * in1[0];
             }
-            else if(input_count4 == 1)
+        }
+        else if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for(int i = 0; i < input1_count4; ++i)
-                {
-                    *out_ptr++ = (in1[i]) * in0[0];
-                }
+                *out_ptr++ = in0[i] * in1[i];
             }
-            else if (input_chan == input1_count4)
+        }
+        else if (input_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = in0[i] * in1[i / input_hw];
-                }
-            }
-            else if (input_chan == input_count4){
-                for(int i = 0; i < input1_count4; i++)
-                {
-                    *out_ptr++ = in0[i/input_hw] * in1[i];
-                }
+                *out_ptr++ = (in1[i]) * in0[0];
             }
-            else
-                return -1;
-            break;
-        case ELT_RSQRT:
+        }
+        else if (input_chan == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                *out_ptr++ = 1 / sqrt(in0[i]);
+                *out_ptr++ = in0[i] * in1[i / input_hw];
             }
-            break;
-        case ELT_MIN_SCALAR:
-            for (int i = 0; i < input_count4; ++i)
+        }
+        else if (input_chan == input_count4)
+        {
+            for (int i = 0; i < input1_count4; i++)
             {
-                *out_ptr++ = ELT_MIN((*in0++), in1[0]);
+                *out_ptr++ = in0[i / input_hw] * in1[i];
             }
-            break;
-        case ELT_SUB_SCALAR:
+        }
+        else
+            return -1;
+        break;
+    case ELT_RSQRT:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            *out_ptr++ = 1 / sqrt(in0[i]);
+        }
+        break;
+    case ELT_MIN_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            *out_ptr++ = ELT_MIN((*in0++), in1[0]);
+        }
+        break;
+    case ELT_SUB_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            *out_ptr++ = (*in0++) - in1[0];
+        }
+        break;
+    case ELT_PROD_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            *out_ptr++ = (*in0++) * in1[0];
+        }
+        break;
+    case ELT_DIV:
+        if (input1_count4 == 1)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                *out_ptr++ = (*in0++) - in1[0];
+                *out_ptr++ = in0[i] / in1[0];
             }
-            break;
-        case ELT_PROD_SCALAR:
+        }
+        else if (input_count4 == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                *out_ptr++ = (*in0++) * in1[0];
+                *out_ptr++ = in0[i] / in1[i];
             }
-            break;
-        case ELT_DIV:
-            if (input1_count4 == 1)
+        }
+        else if (input_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = in0[i] / in1[0];
-                }
+                *out_ptr++ = in0[0] / (*in1++);
             }
-            else if (input_count4 == input1_count4)
+        }
+        else
+        {
+            break;
+        }
+        break;
+    case ELT_POW:
+        if (input_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; i++)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    *out_ptr++ = in0[i] / in1[i];
-                }
+                *out_ptr++ = powf(in0[0], in1[i]);
             }
-            else if (input_count4 == 1)
+        }
+        else if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; i++)
             {
-                for (int i = 0; i < input1_count4; ++i)
-                {
-                    *out_ptr++ = in0[0] / (*in1++);
-                }
+                *out_ptr++ = powf(in0[0], in1[i]);
             }
-            else
+        }
+        else if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; i++)
             {
-                break;
-            }
-            break;
-        case ELT_POW:
-            if(input_count4 == 1){
-                for(int i = 0; i < input1_count4; i++){
-                    *out_ptr++ = powf(in0[0], in1[i]);
-                }
-            } else if (input1_count4 == 1){
-                for(int i = 0; i < input1_count4; i++){
-                    *out_ptr++ = powf(in0[0], in1[i]);
-                }
-            } else if (input_count4 == input1_count4){
-                for(int i = 0; i < input_count4; i++){
-                    *out_ptr++ = powf(in0[i], in1[i]);
-                }
-            } else {
-                TLOG_ERR("Case not support \n");
-            }
-            break;
-        case ELT_POWER:
-            for(int i = 0; i < input_count4; i++){
-                *out_ptr++ = powf((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power);
-            }
-            break;
-        case ELT_LOG:
-            for(int i = 0; i < input_count4; i++){
-                *out_ptr++ = log(in0[i]);
-            }
-            break;
-        case ELT_EXP:
-            for(int i = 0; i < input_count4; i++){
-                *out_ptr++ = exp(in0[i]);
-            }
-            break;
-        case ELT_SQRT:
-            for(int i = 0; i < input_count4; i++){
-                *out_ptr++ = sqrt(in0[i]);
-            }
-            break;
-        case ELT_FLOOR:
-            for(int i = 0; i < input_count4; i++){
-                *out_ptr++ = floor(in0[i]);
-            }
-            break;
-        case ELT_SQUARE:
-            for(int i = 0; i < input_count4; i++){
-                *out_ptr++ = pow(in0[i], 2);
+                *out_ptr++ = powf(in0[i], in1[i]);
             }
-            break;
-        default:
-            break;
+        }
+        else
+        {
+            TLOG_ERR("Case not support \n");
+        }
+        break;
+    case ELT_POWER:
+        for (int i = 0; i < input_count4; i++)
+        {
+            *out_ptr++ = powf((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power);
+        }
+        break;
+    case ELT_LOG:
+        for (int i = 0; i < input_count4; i++)
+        {
+            *out_ptr++ = log(in0[i]);
+        }
+        break;
+    case ELT_EXP:
+        for (int i = 0; i < input_count4; i++)
+        {
+            *out_ptr++ = exp(in0[i]);
+        }
+        break;
+    case ELT_SQRT:
+        for (int i = 0; i < input_count4; i++)
+        {
+            *out_ptr++ = sqrt(in0[i]);
+        }
+        break;
+    case ELT_FLOOR:
+        for (int i = 0; i < input_count4; i++)
+        {
+            *out_ptr++ = floor(in0[i]);
+        }
+        break;
+    case ELT_SQUARE:
+        for (int i = 0; i < input_count4; i++)
+        {
+            *out_ptr++ = pow(in0[i], 2);
+        }
+        break;
+    default:
+        break;
     }
 
     return 0;
@@ -266,9 +285,9 @@ static int ref_eltwise_uint8(struct tensor* output_tensor, struct tensor* input_
                              struct tensor* input_tensor1, int type, int input_count4, int input_chan, int input_hw,
                              int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param)
 {
-    uint8_t* input0_uint8 = ( uint8_t* )input_tensor0->data;
+    uint8_t* input0_uint8 = (uint8_t*)input_tensor0->data;
     uint8_t* input1_uint8 = NULL;
-    uint8_t* output_uint8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
 
     float in_scale0 = input_tensor0->scale;
     float in_scale1 = 0.f;
@@ -278,216 +297,234 @@ static int ref_eltwise_uint8(struct tensor* output_tensor, struct tensor* input_
     int out_zero = output_tensor->zero_point;
 
     /* input dequant */
-    float* in0 = ( float* )sys_malloc(input_tensor0->elem_num * sizeof(float));
+    float* in0 = (float*)sys_malloc(input_tensor0->elem_num * sizeof(float));
     float* in1 = NULL;
-    float* out_ptr = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float));
+    float* out_ptr = (float*)sys_malloc(output_tensor->elem_num * sizeof(float));
 
     for (int i = 0; i < input_tensor0->elem_num; i++)
         in0[i] = (input0_uint8[i] - in_zero0) * in_scale0;
 
     if (input_tensor1 != NULL)
     {
-        input1_uint8 = ( uint8_t* )input_tensor1->data;
+        input1_uint8 = (uint8_t*)input_tensor1->data;
         in_scale1 = input_tensor1->scale;
         in_zero1 = input_tensor1->zero_point;
-        in1 = ( float* )sys_malloc(input_tensor1->elem_num * sizeof(float));
+        in1 = (float*)sys_malloc(input_tensor1->elem_num * sizeof(float));
         for (int i = 0; i < input_tensor1->elem_num; i++)
             in1[i] = (input1_uint8[i] - in_zero1) * in_scale1;
     }
     /* eltwise operator */
     switch (type)
     {
-        case ELT_SUB:
-            if (input_count4 == input1_count4)
-            {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] - in1[i];
-                }
-            }
-            else if (input_chan == input1_count4)
-            {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] - in1[i / input_hw];
-                }
-            }
-            else if (input1_count4 == 1)
+    case ELT_SUB:
+        if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] - in1[0];
-                }
+                out_ptr[i] = in0[i] - in1[i];
             }
-            else
-                return -1;
-            break;
-        case ELT_SUM:
-            if (input1_count4 == 1)
+        }
+        else if (input_chan == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] + in1[0];
-                }
+                out_ptr[i] = in0[i] - in1[i / input_hw];
             }
-            else if (input_count4 == input1_count4)
+        }
+        else if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] + in1[i];
-                }
+                out_ptr[i] = in0[i] - in1[0];
             }
-            else if (input_chan == input1_count4)
+        }
+        else
+            return -1;
+        break;
+    case ELT_SUM:
+        if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] + in1[i / input_hw];
-                }
+                out_ptr[i] = in0[i] + in1[0];
             }
-            else if(input_hw == input_hw_1){
-                for( int i = 0; i < input_chan; i++){
-                    for(int j = 0; j < input_hw; j++){
-                        out_ptr[i] = in0[i*input_hw + j] + in1[j];
-                    }
-                }
-            }
-            else
-                return -1;
-            break;
-        case ELT_MAX:
+        }
+        else if (input_count4 == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = ELT_MAX(in0[i], in1[i]);
+                out_ptr[i] = in0[i] + in1[i];
             }
-            break;
-        case ELT_PROD:
-            if (input1_count4 == 1)
+        }
+        else if (input_chan == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] * in1[0];
-                }
+                out_ptr[i] = in0[i] + in1[i / input_hw];
             }
-            else if (input_count4 == input1_count4)
+        }
+        else if (input_hw == input_hw_1)
+        {
+            for (int i = 0; i < input_chan; i++)
             {
-                for (int i = 0; i < input_count4; ++i)
+                for (int j = 0; j < input_hw; j++)
                 {
-                    out_ptr[i] = in0[i] * in1[i];
+                    out_ptr[i] = in0[i * input_hw + j] + in1[j];
                 }
             }
-            else if (input_chan == input1_count4)
+        }
+        else
+            return -1;
+        break;
+    case ELT_MAX:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = ELT_MAX(in0[i], in1[i]);
+        }
+        break;
+    case ELT_PROD:
+        if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] * in1[i / input_hw];
-                }
+                out_ptr[i] = in0[i] * in1[0];
             }
-            else
-                return -1;
-            break;
-        case ELT_RSQRT:
+        }
+        else if (input_count4 == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = 1 / sqrt(in0[i]);
+                out_ptr[i] = in0[i] * in1[i];
             }
-            break;
-        case ELT_MIN_SCALAR:
+        }
+        else if (input_chan == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = ELT_MIN(in0[i], in1[0]);
+                out_ptr[i] = in0[i] * in1[i / input_hw];
             }
-            break;
-        case ELT_SUB_SCALAR:
+        }
+        else
+            return -1;
+        break;
+    case ELT_RSQRT:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = 1 / sqrt(in0[i]);
+        }
+        break;
+    case ELT_MIN_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = ELT_MIN(in0[i], in1[0]);
+        }
+        break;
+    case ELT_SUB_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = in0[i] - in1[0];
+        }
+        break;
+    case ELT_PROD_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = in0[i] * in1[0];
+        }
+        break;
+    case ELT_DIV:
+        if (input1_count4 == 1)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = in0[i] - in1[0];
+                out_ptr[i] = in0[i] / in1[0];
             }
-            break;
-        case ELT_PROD_SCALAR:
+        }
+        else if (input_count4 == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = in0[i] * in1[0];
+                out_ptr[i] = in0[i] / in1[i];
             }
-            break;
-        case ELT_DIV:
-            if (input1_count4 == 1)
+        }
+        else if (input_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] / in1[0];
-                }
+                out_ptr[i] = in0[0] / in1[i];
             }
-            else if (input_count4 == input1_count4)
+        }
+        else
+        {
+            break;
+        }
+        break;
+    case ELT_POW:
+        if (input_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; i++)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] / in1[i];
-                }
+                out_ptr[i] = pow(in0[0], in1[i]);
             }
-            else if (input_count4 == 1)
+        }
+        else if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; i++)
             {
-                for (int i = 0; i < input1_count4; ++i)
-                {
-                    out_ptr[i] = in0[0] / in1[i];
-                }
+                out_ptr[i] = pow(in0[0], in1[i]);
             }
-            else
+        }
+        else if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; i++)
             {
-                break;
-            }
-            break;
-        case ELT_POW:
-            if(input_count4 == 1){
-                for(int i = 0; i < input1_count4; i++){
-                    out_ptr[i] = pow(in0[0], in1[i]);
-                }
-            } else if (input1_count4 == 1){
-                for(int i = 0; i < input1_count4; i++){
-                    out_ptr[i] = pow(in0[0], in1[i]);
-                }
-            } else if (input_count4 == input1_count4){
-                for(int i = 0; i < input_count4; i++){
-                    out_ptr[i] = pow(in0[i], in1[i]);
-                }
-            } else {
-                TLOG_ERR("Case not support \n");
-            }
-            break;
-        case ELT_POWER:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power);
+                out_ptr[i] = pow(in0[i], in1[i]);
             }
-            break;
-        case ELT_LOG:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = log(in0[i]);
-            }
-            break;
-        case ELT_EXP:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = exp(in0[i]);
-            }
-            break;
-        case ELT_SQRT:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = sqrt(in0[i]);
-            }
-            break;
-        case ELT_FLOOR:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = floor(in0[i]);
-            }
-            break;
-        case ELT_SQUARE:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = pow(in0[i], 2);
-            }
-            break;
-        default:
-            break;
+        }
+        else
+        {
+            TLOG_ERR("Case not support \n");
+        }
+        break;
+    case ELT_POWER:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power);
+        }
+        break;
+    case ELT_LOG:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = log(in0[i]);
+        }
+        break;
+    case ELT_EXP:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = exp(in0[i]);
+        }
+        break;
+    case ELT_SQRT:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = sqrt(in0[i]);
+        }
+        break;
+    case ELT_FLOOR:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = floor(in0[i]);
+        }
+        break;
+    case ELT_SQUARE:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = pow(in0[i], 2);
+        }
+        break;
+    default:
+        break;
     }
 
-
     /* output quant */
     for (int i = 0; i < output_tensor->elem_num; i++)
     {
@@ -508,228 +545,246 @@ static int ref_eltwise_uint8(struct tensor* output_tensor, struct tensor* input_
 }
 
 static int ref_eltwise_int8(struct tensor* output_tensor, struct tensor* input_tensor0,
-                             struct tensor* input_tensor1, int type, int input_count4, int input_chan, int input_hw,
-                             int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param)
+                            struct tensor* input_tensor1, int type, int input_count4, int input_chan, int input_hw,
+                            int input1_count4, int num_thread, int input_hw_1, struct eltwise_param* eltwise_param)
 {
-    int8_t* input0_int8 = ( int8_t* )input_tensor0->data;
+    int8_t* input0_int8 = (int8_t*)input_tensor0->data;
     int8_t* input1_int8 = NULL;
-    int8_t* output_int8 = ( int8_t* )output_tensor->data;
+    int8_t* output_int8 = (int8_t*)output_tensor->data;
 
     float in_scale0 = input_tensor0->scale;
     float in_scale1 = 0.f;
     float out_scale = output_tensor->scale;
 
     /* input dequant */
-    float* in0 = ( float* )sys_malloc(input_tensor0->elem_num * sizeof(float));
+    float* in0 = (float*)sys_malloc(input_tensor0->elem_num * sizeof(float));
     float* in1 = NULL;
-    float* out_ptr = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float));
+    float* out_ptr = (float*)sys_malloc(output_tensor->elem_num * sizeof(float));
 
     for (int i = 0; i < input_tensor0->elem_num; i++)
-        in0[i] = (float )input0_int8[i] * in_scale0;
+        in0[i] = (float)input0_int8[i] * in_scale0;
 
     if (input_tensor1 != NULL)
     {
-        input1_int8 = ( int8_t* )input_tensor1->data;
+        input1_int8 = (int8_t*)input_tensor1->data;
         in_scale1 = input_tensor1->scale;
-        in1 = ( float* )sys_malloc(input_tensor1->elem_num * sizeof(float));
+        in1 = (float*)sys_malloc(input_tensor1->elem_num * sizeof(float));
         for (int i = 0; i < input_tensor1->elem_num; i++)
-            in1[i] = (float )input1_int8[i] * in_scale1;
+            in1[i] = (float)input1_int8[i] * in_scale1;
     }
 
     /* eltwise operator */
     switch (type)
     {
-        case ELT_SUB:
-            if (input_count4 == input1_count4)
-            {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] - in1[i];
-                }
-            }
-            else if (input_chan == input1_count4)
-            {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] - in1[i / input_hw];
-                }
-            }
-            else if (input1_count4 == 1)
+    case ELT_SUB:
+        if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] - in1[0];
-                }
+                out_ptr[i] = in0[i] - in1[i];
             }
-            else
-                return -1;
-            break;
-        case ELT_SUM:
-            if (input1_count4 == 1)
+        }
+        else if (input_chan == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] + in1[0];
-                }
+                out_ptr[i] = in0[i] - in1[i / input_hw];
             }
-            else if (input_count4 == input1_count4)
+        }
+        else if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] + in1[i];
-                }
+                out_ptr[i] = in0[i] - in1[0];
             }
-            else if (input_chan == input1_count4)
+        }
+        else
+            return -1;
+        break;
+    case ELT_SUM:
+        if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] + in1[i / input_hw];
-                }
-            }
-            else if(input_hw == input_hw_1){
-                for( int i = 0; i < input_chan; i++){
-                    for(int j = 0; j < input_hw; j++){
-                        out_ptr[i] = in0[i*input_hw + j] + in1[j];
-                    }
-                }
+                out_ptr[i] = in0[i] + in1[0];
             }
-            else
-                return -1;
-            break;
-        case ELT_MAX:
+        }
+        else if (input_count4 == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = ELT_MAX(in0[i], in1[i]);
+                out_ptr[i] = in0[i] + in1[i];
             }
-            break;
-        case ELT_PROD:
-            if (input1_count4 == 1)
+        }
+        else if (input_chan == input1_count4)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] * in1[0];
-                }
+                out_ptr[i] = in0[i] + in1[i / input_hw];
             }
-            else if (input_count4 == input1_count4)
+        }
+        else if (input_hw == input_hw_1)
+        {
+            for (int i = 0; i < input_chan; i++)
             {
-                for (int i = 0; i < input_count4; ++i)
+                for (int j = 0; j < input_hw; j++)
                 {
-                    out_ptr[i] = in0[i] * in1[i];
+                    out_ptr[i] = in0[i * input_hw + j] + in1[j];
                 }
             }
-            else if (input_chan == input1_count4)
+        }
+        else
+            return -1;
+        break;
+    case ELT_MAX:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = ELT_MAX(in0[i], in1[i]);
+        }
+        break;
+    case ELT_PROD:
+        if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] * in1[i / input_hw];
-                }
+                out_ptr[i] = in0[i] * in1[0];
             }
-            else
-                return -1;
-            break;
-        case ELT_RSQRT:
+        }
+        else if (input_count4 == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = 1 / sqrt(in0[i]);
+                out_ptr[i] = in0[i] * in1[i];
             }
-            break;
-        case ELT_MIN_SCALAR:
+        }
+        else if (input_chan == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = ELT_MIN(in0[i], in1[0]);
+                out_ptr[i] = in0[i] * in1[i / input_hw];
             }
-            break;
-        case ELT_SUB_SCALAR:
+        }
+        else
+            return -1;
+        break;
+    case ELT_RSQRT:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = 1 / sqrt(in0[i]);
+        }
+        break;
+    case ELT_MIN_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = ELT_MIN(in0[i], in1[0]);
+        }
+        break;
+    case ELT_SUB_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = in0[i] - in1[0];
+        }
+        break;
+    case ELT_PROD_SCALAR:
+        for (int i = 0; i < input_count4; ++i)
+        {
+            out_ptr[i] = in0[i] * in1[0];
+        }
+        break;
+    case ELT_DIV:
+        if (input1_count4 == 1)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = in0[i] - in1[0];
+                out_ptr[i] = in0[i] / in1[0];
             }
-            break;
-        case ELT_PROD_SCALAR:
+        }
+        else if (input_count4 == input1_count4)
+        {
             for (int i = 0; i < input_count4; ++i)
             {
-                out_ptr[i] = in0[i] * in1[0];
+                out_ptr[i] = in0[i] / in1[i];
             }
-            break;
-        case ELT_DIV:
-            if (input1_count4 == 1)
+        }
+        else if (input_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; ++i)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] / in1[0];
-                }
+                out_ptr[i] = in0[0] / in1[i];
             }
-            else if (input_count4 == input1_count4)
+        }
+        else
+        {
+            break;
+        }
+        break;
+    case ELT_POW:
+        if (input_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; i++)
             {
-                for (int i = 0; i < input_count4; ++i)
-                {
-                    out_ptr[i] = in0[i] / in1[i];
-                }
+                out_ptr[i] = pow(in0[0], in1[i]);
             }
-            else if (input_count4 == 1)
+        }
+        else if (input1_count4 == 1)
+        {
+            for (int i = 0; i < input1_count4; i++)
             {
-                for (int i = 0; i < input1_count4; ++i)
-                {
-                    out_ptr[i] = in0[0] / in1[i];
-                }
+                out_ptr[i] = pow(in0[0], in1[i]);
             }
-            else
+        }
+        else if (input_count4 == input1_count4)
+        {
+            for (int i = 0; i < input_count4; i++)
             {
-                break;
-            }
-            break;
-        case ELT_POW:
-            if(input_count4 == 1){
-                for(int i = 0; i < input1_count4; i++){
-                    out_ptr[i] = pow(in0[0], in1[i]);
-                }
-            } else if (input1_count4 == 1){
-                for(int i = 0; i < input1_count4; i++){
-                    out_ptr[i] = pow(in0[0], in1[i]);
-                }
-            } else if (input_count4 == input1_count4){
-                for(int i = 0; i < input_count4; i++){
-                    out_ptr[i] = pow(in0[i], in1[i]);
-                }
-            } else {
-                TLOG_ERR("Case not support \n");
+                out_ptr[i] = pow(in0[i], in1[i]);
             }
-            break;
-        case ELT_POWER:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power);
-            }
-            break;
-        case ELT_LOG:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = log(in0[i]);
-            }
-            break;
-        case ELT_EXP:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = exp(in0[i]);
-            }
-            break;
-        case ELT_SQRT:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = sqrt(in0[i]);
-            }
-            break;
-        case ELT_FLOOR:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = floor(in0[i]);
-            }
-            break;
-        case ELT_SQUARE:
-            for(int i = 0; i < input_count4; i++){
-                out_ptr[i] = pow(in0[i], 2);
-            }
-            break;
-        default:
-            break;
+        }
+        else
+        {
+            TLOG_ERR("Case not support \n");
+        }
+        break;
+    case ELT_POWER:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = pow((eltwise_param->shift + eltwise_param->scale * in0[i]), eltwise_param->power);
+        }
+        break;
+    case ELT_LOG:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = log(in0[i]);
+        }
+        break;
+    case ELT_EXP:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = exp(in0[i]);
+        }
+        break;
+    case ELT_SQRT:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = sqrt(in0[i]);
+        }
+        break;
+    case ELT_FLOOR:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = floor(in0[i]);
+        }
+        break;
+    case ELT_SQUARE:
+        for (int i = 0; i < input_count4; i++)
+        {
+            out_ptr[i] = pow(in0[i], 2);
+        }
+        break;
+    default:
+        break;
     }
 
-
     /* output quant */
     for (int i = 0; i < output_tensor->elem_num; i++)
     {
@@ -774,7 +829,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor0 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem;
+    struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem;
 
     int layout = ir_graph->graph_layout;
     void* input0 = input_tensor0->data;
@@ -789,7 +844,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         input1 = input_tensor1->data;
         input1_count4 = input_tensor1->elem_num;
         int dim1_size = input_tensor1->dim_num;
-        input_hw_1 = input_tensor1->dims[dim1_size-2]*input_tensor1->dims[dim1_size-1];
+        input_hw_1 = input_tensor1->dims[dim1_size - 2] * input_tensor1->dims[dim1_size - 1];
     }
 
     if (!input_tensor1 || input_tensor0->elem_num >= input_tensor1->elem_num)
@@ -800,11 +855,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         int dim0_size = input_tensor0->dim_num;
         if (layout == TENGINE_LAYOUT_NCHW)
         {
-            input_chan_0 = input_tensor0->dims[dim0_size-3];
-            if(input_tensor0->dims[dim0_size-4]){
-                input_chan_0 *= input_tensor0->dims[dim0_size-4];
+            input_chan_0 = input_tensor0->dims[dim0_size - 3];
+            if (input_tensor0->dims[dim0_size - 4])
+            {
+                input_chan_0 *= input_tensor0->dims[dim0_size - 4];
             }
-            input_hw_0 = input_tensor0->dims[dim0_size-2] * input_tensor0->dims[dim0_size-1];
+            input_hw_0 = input_tensor0->dims[dim0_size - 2] * input_tensor0->dims[dim0_size - 1];
         }
         else if (layout == TENGINE_LAYOUT_NHWC)
         {
@@ -825,7 +881,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
                                     input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param);
         else if (input_tensor0->data_type == TENGINE_DT_INT8)
             ret = ref_eltwise_int8(output_tensor, input_tensor0, input_tensor1, eltwise_param->type, input0_count4,
-                                    input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param);
+                                   input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param);
         else
         {
             TLOG_ERR("Input data type %d not to be supported.\n", input_tensor0->data_type);
@@ -866,7 +922,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
                                     input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param);
         else if (output_tensor->data_type == TENGINE_DT_INT8)
             ret = ref_eltwise_int8(output_tensor, input_tensor1, input_tensor0, eltwise_param->type, input0_count4,
-                                    input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param);
+                                   input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread, input_hw_1, eltwise_param);
         else
         {
             TLOG_ERR("Input data type %d not to be supported.\n", output_tensor->data_type);
diff --git a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c
index 3aca14499..1f7a7aad5 100644
--- a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c
+++ b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct elu_param* elu_param = ( struct elu_param* )ir_node->op.param_mem;
+    struct elu_param* elu_param = (struct elu_param*)ir_node->op.param_mem;
 
     int num_thread = exec_graph->num_thread;
 
diff --git a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c
index 78da9c64c..1125dc958 100644
--- a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c
+++ b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.c
@@ -30,10 +30,9 @@
 
 #include <arm_neon.h>
 
-
 static void elu_kernel(int i, int id, void* data, const float* input, float* output, float alpha)
 {
-    int elem_num = (( int* )data)[0];
+    int elem_num = ((int*)data)[0];
     float32x4_t _one = vdupq_n_f32(1.f);
     float32x4_t _zero = vdupq_n_f32(0.f);
     float32x4_t _alpha = vdupq_n_f32(alpha);
@@ -67,8 +66,8 @@ static void elu_kernel(int i, int id, void* data, const float* input, float* out
 int elu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct elu_param* elu_param,
             int num_thread)
 {
-    float* data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    float* data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
     float alpha = elu_param->alpha;
 
     int chan_num = (input_tensor->dims[0]) * (input_tensor->dims[1]);
diff --git a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h
index 172ddf329..a3d937afe 100644
--- a/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h
+++ b/source/device/cpu/op/elu/cortex-a/elu_kernel_arm.h
@@ -29,7 +29,6 @@
 
 #include "graph/tensor.h"
 
-
 int elu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct elu_param* elu_param,
             int num_thread);
 
diff --git a/source/device/cpu/op/elu/elu_ref.c b/source/device/cpu/op/elu/elu_ref.c
index 1db10f74e..1d41d940d 100644
--- a/source/device/cpu/op/elu/elu_ref.c
+++ b/source/device/cpu/op/elu/elu_ref.c
@@ -37,7 +37,6 @@
 
 #include <math.h>
 
-
 typedef struct __elu_param
 {
     float scale;
@@ -45,7 +44,6 @@ typedef struct __elu_param
     float alpha;
 } _elu_param, *p_elu_param;
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -89,12 +87,12 @@ int ref_elu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* data = ( float* )sys_malloc(input_size * sizeof(float));
-    float* out_data = ( float* )sys_malloc(output_size * sizeof(float));
+    float* data = (float*)sys_malloc(input_size * sizeof(float));
+    float* out_data = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     for (int i = 0; i < size; i++)
@@ -108,7 +106,7 @@ int ref_elu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int
             out_data[i] = data[i];
         }
     }
-    
+
     /* quant */
     for (int i = 0; i < output_size; i++)
     {
@@ -134,7 +132,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct elu_param* param = ( struct elu_param* )node->op.param_mem;
+    struct elu_param* param = (struct elu_param*)node->op.param_mem;
 
     int elem_num = input_tensor->elem_num;
     void* in_data = input_tensor->data;
@@ -150,7 +148,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ref_elu_fp32((float*)in_data, (float*)out_data, elem_num, &op_param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ref_elu_uint8(input_tensor, output_tensor, elem_num, &op_param);
 
     return 0;
diff --git a/source/device/cpu/op/embedding/embedding_ref.c b/source/device/cpu/op/embedding/embedding_ref.c
index 188a5aeb6..5fe920a6a 100644
--- a/source/device/cpu/op/embedding/embedding_ref.c
+++ b/source/device/cpu/op/embedding/embedding_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -62,7 +61,7 @@ int ref_embed_fp32(float* in_data, float* out_data, float* weight_data, float* b
             word_index = 0;
         if (word_index >= input_dim)
             word_index = input_dim - 1;
-        const float* embed = ( const float* )weight_data + num_output * word_index;
+        const float* embed = (const float*)weight_data + num_output * word_index;
         for (int z = 0; z < num_output; z++)
         {
             out_data[i * num_output + z] = embed[z];
@@ -82,7 +81,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct embedding_param* param = ( struct embedding_param* )node->op.param_mem;
+    struct embedding_param* param = (struct embedding_param*)node->op.param_mem;
 
     struct tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
     struct tensor* bias_tensor = NULL;
@@ -91,9 +90,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]);
     }
 
-    return ref_embed_fp32((float*)input->data, (float*)output->data, (float*)weight_tensor->data, 
-        bias_tensor ? (float*)bias_tensor->data : NULL, param->input_dim, param->num_output, 
-        input->elem_size, param->bias_term, 1.0f, 0.0f);
+    return ref_embed_fp32((float*)input->data, (float*)output->data, (float*)weight_tensor->data,
+                          bias_tensor ? (float*)bias_tensor->data : NULL, param->input_dim, param->num_output,
+                          input->elem_size, param->bias_term, 1.0f, 0.0f);
 }
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
diff --git a/source/device/cpu/op/expand/expand_ref.c b/source/device/cpu/op/expand/expand_ref.c
index e92b30d99..fc0bdcfe4 100644
--- a/source/device/cpu/op/expand/expand_ref.c
+++ b/source/device/cpu/op/expand/expand_ref.c
@@ -36,7 +36,6 @@
 #include <string.h>
 #include <limits.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -47,7 +46,8 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
     return 0;
 }
 
-int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_dims, int* in2_dims){
+int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_dims, int* in2_dims)
+{
     int i_n = in1_dims[0] == 0 ? 1 : in1_dims[0];
     int i_c = in1_dims[1] == 0 ? 1 : in1_dims[1];
     int i_h = in1_dims[2] == 0 ? 1 : in1_dims[2];
@@ -59,26 +59,26 @@ int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_
     int o_w = in2_dims[3] == 0 ? 1 : in2_dims[3];
 
     int int_max = INT_MAX;
-    if(i_n > int_max / i_c || i_h > int_max /(i_n*i_c) || i_w > int_max / (i_n * i_c * i_h))
+    if (i_n > int_max / i_c || i_h > int_max / (i_n * i_c) || i_w > int_max / (i_n * i_c * i_h))
     {
         TLOG_INFO("input dims overflow!");
         return -1;
     }
-    if(o_n > int_max /o_c || o_h > int_max/(o_n*o_c)||o_w > int_max/(o_n*o_c*o_h))
+    if (o_n > int_max / o_c || o_h > int_max / (o_n * o_c) || o_w > int_max / (o_n * o_c * o_h))
     {
         TLOG_INFO("output dims overflow!");
         return -1;
     }
-    
+
     int index = 0;
     int i_index = 0;
-    if( 1 == i_n && 1 == i_h && 1 == i_w && 1 == o_n && i_c == o_c)
+    if (1 == i_n && 1 == i_h && 1 == i_w && 1 == o_n && i_c == o_c)
     {
-        for(int n = 0; n < o_n; ++n)
+        for (int n = 0; n < o_n; ++n)
         {
-            for(int c = 0; c < o_c ; c++)
+            for (int c = 0; c < o_c; c++)
             {
-                for(int i = 0; i < o_h*o_w; i++)
+                for (int i = 0; i < o_h * o_w; i++)
                 {
                     out_data[index++] = in1_data[i_index];
                 }
@@ -86,23 +86,23 @@ int ref_expand_fp32(float* in1_data, float* in2_data, float* out_data, int* in1_
             }
         }
     }
-    else 
+    else
     {
         int i_size = i_n * i_c * i_h * i_w;
         int refreshed = 0;
-        for(int n = 0; n < o_n; n++)
+        for (int n = 0; n < o_n; n++)
         {
-            for(int c = 0; c < o_c; c++)
+            for (int c = 0; c < o_c; c++)
             {
-                for(int h = 0; h < o_h; ++h)
+                for (int h = 0; h < o_h; ++h)
                 {
-                    for(int w = 0; w < o_w; ++w)
+                    for (int w = 0; w < o_w; ++w)
                     {
                         refreshed = 0;
                         if (i_index == i_size)
                             i_index = 0;
                         out_data[index++] = in1_data[i_index];
-                        if (i_w != 1) 
+                        if (i_w != 1)
                         {
                             i_index++;
                             refreshed = 1;
@@ -136,34 +136,33 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input1_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* input2_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct expand_param* param = ( struct expand_param* )ir_node->op.param_mem;
+    struct expand_param* param = (struct expand_param*)ir_node->op.param_mem;
 
     int dim1_size = input1_tensor->dim_num;
     int dim2_size = input2_tensor->dim_num;
 
-
-    int* input1_dims = (int*)malloc(sizeof(int)*4);
-    int* input2_dims = (int*)malloc(sizeof(int)*4);
-    for(int i = 0; i < 4; i++)
+    int* input1_dims = (int*)malloc(sizeof(int) * 4);
+    int* input2_dims = (int*)malloc(sizeof(int) * 4);
+    for (int i = 0; i < 4; i++)
     {
         input1_dims[i] = 0;
     }
 
-    for(int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++)
     {
         input2_dims[i] = 0;
     }
 
-    for(int i = 0; i < dim1_size ; i++)
+    for (int i = 0; i < dim1_size; i++)
     {
         input1_dims[i] = input1_tensor->dims[i];
     }
-    for(int i = 0; i < param->dim_num; i++)
+    for (int i = 0; i < param->dim_num; i++)
     {
         input2_dims[i] = param->ex_shape[i];
     }
-    ref_expand_fp32((float*)input1_tensor->data, (float*)input2_tensor->data, 
-        (float*)output_tensor->data, input1_dims, input2_dims);
+    ref_expand_fp32((float*)input1_tensor->data, (float*)input2_tensor->data,
+                    (float*)output_tensor->data, input1_dims, input2_dims);
 
     free(input1_dims);
     free(input2_dims);
@@ -177,12 +176,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops expand_node_ops = {.prerun = NULL,
-                                           .run = run,
-                                           .reshape = NULL,
-                                           .postrun = NULL,
-                                           .init_node = init_node,
-                                           .release_node = release_node,
-                                           .score = score};
+                                          .run = run,
+                                          .reshape = NULL,
+                                          .postrun = NULL,
+                                          .init_node = init_node,
+                                          .release_node = release_node,
+                                          .score = score};
 
 int register_expand_ref_op()
 {
diff --git a/source/device/cpu/op/expanddims/expanddims_ref.c b/source/device/cpu/op/expanddims/expanddims_ref.c
index ae2c31a93..7cd37a4dd 100644
--- a/source/device/cpu/op/expanddims/expanddims_ref.c
+++ b/source/device/cpu/op/expanddims/expanddims_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
diff --git a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c
index b472c6fcc..28cf324c7 100644
--- a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c
+++ b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.c
@@ -29,7 +29,6 @@
 #include <math.h>
 #include <arm_neon.h>
 
-
 void hgemv_1x8_a55(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output);
 void hgemv_1x2_a55(__fp16* biases, __fp16* input, __fp16* kernel, long kernel_size, __fp16* output);
 
@@ -41,12 +40,12 @@ void hgemv1x8(const __fp16* input, const __fp16* output, __fp16* weight_interlea
     __fp16 *cur_kernel, *cur_biases, *cur_result;
 
     // #pragma omp parallel for num_threads(num_thread)
-    for(ch = start_channel; ch < end_channel; ch += 8)
+    for (ch = start_channel; ch < end_channel; ch += 8)
     {
-        cur_kernel = ( __fp16* )(weight_interleaved + kernel_size * ch);
-        cur_result = ( __fp16* )(output + ch);
-        cur_biases = biases ? ( __fp16* )(biases + ch) : NULL;
-        hgemv_1x8_a55(cur_biases, ( __fp16* )input, cur_kernel, kernel_size, cur_result); // todo implement with A76
+        cur_kernel = (__fp16*)(weight_interleaved + kernel_size * ch);
+        cur_result = (__fp16*)(output + ch);
+        cur_biases = biases ? (__fp16*)(biases + ch) : NULL;
+        hgemv_1x8_a55(cur_biases, (__fp16*)input, cur_kernel, kernel_size, cur_result); // todo implement with A76
     }
 }
 
@@ -58,26 +57,25 @@ void hgemv1x2(const __fp16* input, const __fp16* output, __fp16* weight_interlea
     int ch = 0;
     __fp16 *cur_kernel, *cur_biases, *cur_result;
 
-    for(ch = start_channel; ch < (end_channel & -2); ch += 2)
+    for (ch = start_channel; ch < (end_channel & -2); ch += 2)
     {
-        cur_kernel = ( __fp16* )(weight_interleaved + kernel_size * ch);
-        cur_result = ( __fp16* )(output + ch);
-        cur_biases = biases ? ( __fp16* )(biases + ch) : NULL;
-        hgemv_1x2_a55(cur_biases, ( __fp16* )input, cur_kernel, kernel_size, cur_result);
+        cur_kernel = (__fp16*)(weight_interleaved + kernel_size * ch);
+        cur_result = (__fp16*)(output + ch);
+        cur_biases = biases ? (__fp16*)(biases + ch) : NULL;
+        hgemv_1x2_a55(cur_biases, (__fp16*)input, cur_kernel, kernel_size, cur_result);
     }
 
-    if(end_channel & 0x1)
+    if (end_channel & 0x1)
     {
-        cur_kernel = ( __fp16* )(weight_interleaved + kernel_size * ch);
-        cur_result = ( __fp16* )(output + ch);
+        cur_kernel = (__fp16*)(weight_interleaved + kernel_size * ch);
+        cur_result = (__fp16*)(output + ch);
         sum = biases ? *(biases + ch) : 0.f;
-        for(int j = 0; j < kernel_size; j++)
+        for (int j = 0; j < kernel_size; j++)
             sum = sum + input[j] * cur_kernel[j];
         *cur_result = sum;
     }
 }
 
-
 static void interleave_kernel(const __fp16* kernel, __fp16* kernel_interleaved, int out_chan, int kernel_size)
 {
     int i, j, k;
@@ -85,46 +83,45 @@ static void interleave_kernel(const __fp16* kernel, __fp16* kernel_interleaved,
     __fp16* cur_kernel_interleaved;
 
     // interleave 8 kernel
-    for(i = 0; i < (out_chan & -8); i += 8)
+    for (i = 0; i < (out_chan & -8); i += 8)
     {
-        for(j = 0; j < 8; j++)
-            cur_kernel[j] = ( __fp16* )kernel + kernel_size * (i + j);
-        cur_kernel_interleaved = ( __fp16* )kernel_interleaved + kernel_size * i;
-        for(k = 0; k < kernel_size; k++)
-            for(j = 0; j < 8; j++)
+        for (j = 0; j < 8; j++)
+            cur_kernel[j] = (__fp16*)kernel + kernel_size * (i + j);
+        cur_kernel_interleaved = (__fp16*)kernel_interleaved + kernel_size * i;
+        for (k = 0; k < kernel_size; k++)
+            for (j = 0; j < 8; j++)
                 cur_kernel_interleaved[8 * k + j] = *(cur_kernel[j] + k);
     }
 
     // interleave 2 kernel
-    for(; i < (out_chan & -2); i += 2)
+    for (; i < (out_chan & -2); i += 2)
     {
-        for(j = 0; j < 2; j++)
-            cur_kernel[j] = ( __fp16* )kernel + kernel_size * (i + j);
-        cur_kernel_interleaved = ( __fp16* )kernel_interleaved + kernel_size * i;
-        for(k = 0; k < kernel_size; k++)
-            for(j = 0; j < 2; j++)
+        for (j = 0; j < 2; j++)
+            cur_kernel[j] = (__fp16*)kernel + kernel_size * (i + j);
+        cur_kernel_interleaved = (__fp16*)kernel_interleaved + kernel_size * i;
+        for (k = 0; k < kernel_size; k++)
+            for (j = 0; j < 2; j++)
                 cur_kernel_interleaved[2 * k + j] = *(cur_kernel[j] + k);
     }
 
     // copy last kernel
-    if(out_chan & 0x1)
+    if (out_chan & 0x1)
     {
-        cur_kernel[0] = ( __fp16* )kernel + kernel_size * i;
-        cur_kernel_interleaved = ( __fp16* )kernel_interleaved + kernel_size * i;
-        for(k = 0; k < kernel_size; k++)
+        cur_kernel[0] = (__fp16*)kernel + kernel_size * i;
+        cur_kernel_interleaved = (__fp16*)kernel_interleaved + kernel_size * i;
+        for (k = 0; k < kernel_size; k++)
             cur_kernel_interleaved[k] = *(cur_kernel[0] + k);
     }
 
     return;
 }
 
-int fp16_fc_kernel_prerun(struct tensor*  input_tensor , \
-                    struct tensor*  filter_tensor ,  \
-                    struct tensor*  output_tensor , \
-                    struct fc_priv_info*  priv_info , \
-                    struct fc_param* param)
+int fp16_fc_kernel_prerun(struct tensor* input_tensor,
+                          struct tensor* filter_tensor,
+                          struct tensor* output_tensor,
+                          struct fc_priv_info* priv_info,
+                          struct fc_param* param)
 {
-
     int num_output = param->num_output;
     int kernel_size = filter_tensor->dims[1];
     int kernel_align = ((kernel_size + 1) & -2);
@@ -151,14 +148,13 @@ int fp16_fc_kernel_prerun(struct tensor*  input_tensor , \
     return 0;
 }
 
-
-int fp16_fc_kernel_run(struct tensor* input_tensor , \
-                    struct tensor* filter_tensor , \
-                    struct tensor* bias_tensor ,  \
-                    struct tensor* output_tensor , \
-                    struct fc_priv_info* priv_info , \
-                    struct fc_param* param, \
-                    int num_thread, int cpu_affinity)
+int fp16_fc_kernel_run(struct tensor* input_tensor,
+                       struct tensor* filter_tensor,
+                       struct tensor* bias_tensor,
+                       struct tensor* output_tensor,
+                       struct fc_priv_info* priv_info,
+                       struct fc_param* param,
+                       int num_thread, int cpu_affinity)
 {
     int out_num = param->num_output;
     int kernel_size = filter_tensor->dims[1];
@@ -172,16 +168,15 @@ int fp16_fc_kernel_run(struct tensor* input_tensor , \
 
     int out_num_8 = out_num & ~7;
 
-    for(int i = 0; i < input_tensor->dims[0]; i++)
+    for (int i = 0; i < input_tensor->dims[0]; i++)
     {
         __fp16* cur_input = input + i * kernel_size;
         __fp16* cur_output = output + i * out_num;
 
         hgemv1x8(cur_input, cur_output, weight, biases, kernel_size, 0, out_num_8, num_thread, cpu_affinity);
-        if(out_num & 0x7)
+        if (out_num & 0x7)
             hgemv1x2(cur_input, cur_output, weight, biases, kernel_size, out_num_8, out_num, num_thread, cpu_affinity);
     }
 
-    return 0 ;
-
+    return 0;
 }
diff --git a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h
index 50b84a2ae..b3dfd0c13 100644
--- a/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h
+++ b/source/device/cpu/op/fc/cortex-a/armv8.2/fc_kernel_fp16_arm82.h
@@ -27,20 +27,18 @@
 
 #include "../fc_kernel_arm.h"
 
-
-int fp16_fc_kernel_prerun(struct tensor*  input_tensor , \
-                    struct tensor*  filter_tensor ,  \
-                    struct tensor*  output_tensor , \
-                    struct fc_priv_info* priv_info ,      \
-                    struct fc_param* param) ;
-
-int fp16_fc_kernel_run(struct tensor* input_tensor , \
-                 struct tensor* filter_tensor ,\
-                 struct tensor* bias_tensor ,  \
-                 struct tensor* output_tensor , \
-                 struct fc_priv_info*  priv_info , \
-                 struct fc_param* param, \
-                 int num_thread, int cpu_affinity) ;
-
+int fp16_fc_kernel_prerun(struct tensor* input_tensor,
+                          struct tensor* filter_tensor,
+                          struct tensor* output_tensor,
+                          struct fc_priv_info* priv_info,
+                          struct fc_param* param);
+
+int fp16_fc_kernel_run(struct tensor* input_tensor,
+                       struct tensor* filter_tensor,
+                       struct tensor* bias_tensor,
+                       struct tensor* output_tensor,
+                       struct fc_priv_info* priv_info,
+                       struct fc_param* param,
+                       int num_thread, int cpu_affinity);
 
 #endif
diff --git a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c
index 2a7364106..d9322b864 100644
--- a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c
+++ b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c
@@ -41,12 +41,10 @@
 
 #include <string.h>
 
-
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "cortex_a/fc_kernel_fp16_arm82.h"
 #endif
 
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -55,8 +53,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv;
-    struct fc_param* fc_param = ( struct fc_param* )ir_node->op.param_mem;
+    struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv;
+    struct fc_param* fc_param = (struct fc_param*)ir_node->op.param_mem;
 
     /* fp32 prerun */
     if (exec_graph->mode == TENGINE_MODE_FP32)
@@ -71,7 +69,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     else if (exec_graph->mode == TENGINE_MODE_FP16)
     {
-        if(fp16_fc_kernel_prerun(input_tensor, filter_tensor, output_tensor, priv_info, fc_param) < 0)
+        if (fp16_fc_kernel_prerun(input_tensor, filter_tensor, output_tensor, priv_info, fc_param) < 0)
         {
             TLOG_ERR("hcl fp16 fc prerun failed\n");
             // set_tengine_errno(EFAULT);
@@ -79,14 +77,14 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
         }
     }
 #endif
-    else if(exec_graph->mode == TENGINE_MODE_INT8)
-	{
+    else if (exec_graph->mode == TENGINE_MODE_INT8)
+    {
         if (int8_fc_kernel_prerun(input_tensor, filter_tensor, output_tensor, priv_info, fc_param) < 0)
         {
             TLOG_ERR("hcl fc prerun failed\n");
             return -1;
         }
-    }	
+    }
     else
     {
         TLOG_ERR("Tengine work node not support %d\n", exec_graph->mode);
@@ -114,14 +112,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct fc_param* fc_param = ( struct fc_param* )ir_node->op.param_mem;
-    struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv;
+    struct fc_param* fc_param = (struct fc_param*)ir_node->op.param_mem;
+    struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv;
 
     /* fp32 run */
     if (exec_graph->mode == TENGINE_MODE_FP32)
     {
         if (fc_kernel_run(input_tensor, weight_tensor, bias_tensor, output_tensor, priv_info, fc_param, num_thread,
-                        cpu_affinity) < 0)
+                          cpu_affinity)
+            < 0)
         {
             TLOG_ERR("hcl fc run failed\n");
             return -1;
@@ -139,14 +138,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         }
     }
 #endif
-    else if (exec_graph->mode == TENGINE_MODE_INT8) 
-	{
-        if (int8_fc_kernel_run(input_tensor, weight_tensor, bias_tensor, output_tensor, priv_info,fc_param,num_thread,cpu_affinity) < 0)
+    else if (exec_graph->mode == TENGINE_MODE_INT8)
+    {
+        if (int8_fc_kernel_run(input_tensor, weight_tensor, bias_tensor, output_tensor, priv_info, fc_param, num_thread, cpu_affinity) < 0)
         {
             TLOG_ERR("hcl fc run failed\n");
             return -1;
         }
-    }	
+    }
     else
     {
         TLOG_ERR("Tengine work node not support %d\n", exec_graph->mode);
@@ -229,7 +228,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv;
+    struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv;
 
     /* fp32 postrun */
     if (exec_graph->mode == TENGINE_MODE_FP32 || exec_graph->mode == TENGINE_MODE_UINT8)
@@ -251,7 +250,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     /* init the private info data of convolution op */
-    struct fc_priv_info* priv_info = ( struct fc_priv_info* )sys_malloc(sizeof(struct fc_priv_info));
+    struct fc_priv_info* priv_info = (struct fc_priv_info*)sys_malloc(sizeof(struct fc_priv_info));
     if (priv_info == NULL)
     {
         return -1;
@@ -264,7 +263,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
 
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct fc_priv_info* priv_info = ( struct fc_priv_info* )exec_node->ops_priv;
+    struct fc_priv_info* priv_info = (struct fc_priv_info*)exec_node->ops_priv;
     sys_free(priv_info);
     exec_node->ops_priv = NULL;
 
@@ -283,7 +282,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
         return 0;
 #else
     if (input_tensor->data_type != TENGINE_DT_FP32
-    // && input_tensor->data_type != TENGINE_DT_INT8    // 从tengine移植的 fc int8 arm 与 fc int8 ref 相比相差较大，暂且关闭
+        // && input_tensor->data_type != TENGINE_DT_INT8    // the fc int8 arm kernel ported from tengine deviates noticeably from the fc int8 ref results, so it is disabled for now
     )
         return 0;
 #endif
@@ -297,8 +296,7 @@ static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .postrun = postrun,
                                        .init_node = init_node,
                                        .release_node = release_node,
-                                       .score = score
-};
+                                       .score = score};
 
 int register_fc_hcl_arm_op()
 {
diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c
index 15c46faee..42ea8ca63 100644
--- a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c
+++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.c
@@ -32,7 +32,6 @@
 #include <math.h>
 #include <arm_neon.h>
 
-
 #ifdef __aarch64__
 void sgemv_1x8_a72(float* biases, float* input, float* kernel, long kernel_size, float* output);
 void sgemv_1x2_a72(float* biases, float* input, float* kernel, long kernel_size, float* output);
@@ -96,8 +95,8 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in
     for (i = 0; i < (out_chan & -8); i += 8)
     {
         for (j = 0; j < 8; j++)
-            cur_kernel[j] = ( float* )kernel + kernel_size * (i + j);
-        cur_kernel_interleaved = ( float* )kernel_interleaved + kernel_size * i;
+            cur_kernel[j] = (float*)kernel + kernel_size * (i + j);
+        cur_kernel_interleaved = (float*)kernel_interleaved + kernel_size * i;
         for (k = 0; k < kernel_size; k++)
             for (j = 0; j < 8; j++)
                 cur_kernel_interleaved[8 * k + j] = *(cur_kernel[j] + k);
@@ -107,8 +106,8 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in
     for (; i < (out_chan & -2); i += 2)
     {
         for (j = 0; j < 2; j++)
-            cur_kernel[j] = ( float* )kernel + kernel_size * (i + j);
-        cur_kernel_interleaved = ( float* )kernel_interleaved + kernel_size * i;
+            cur_kernel[j] = (float*)kernel + kernel_size * (i + j);
+        cur_kernel_interleaved = (float*)kernel_interleaved + kernel_size * i;
         for (k = 0; k < kernel_size; k++)
             for (j = 0; j < 2; j++)
                 cur_kernel_interleaved[2 * k + j] = *(cur_kernel[j] + k);
@@ -117,8 +116,8 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in
     // copy last kernel
     if (out_chan & 0x1)
     {
-        cur_kernel[0] = ( float* )kernel + kernel_size * i;
-        cur_kernel_interleaved = ( float* )kernel_interleaved + kernel_size * i;
+        cur_kernel[0] = (float*)kernel + kernel_size * i;
+        cur_kernel_interleaved = (float*)kernel_interleaved + kernel_size * i;
         for (k = 0; k < kernel_size; k++)
             cur_kernel_interleaved[k] = *(cur_kernel[0] + k);
     }
@@ -127,7 +126,7 @@ static void interleave_kernel(const float* kernel, float* kernel_interleaved, in
 int fc_kernel_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor,
                      struct fc_priv_info* priv_info, struct fc_param* param)
 {
-    int num_output  = param->num_output;
+    int num_output = param->num_output;
     int kernel_size = filter_tensor->dims[1];
 
     if (!priv_info->interleave_buffer)
@@ -139,8 +138,8 @@ int fc_kernel_prerun(struct tensor* input_tensor, struct tensor* filter_tensor,
         priv_info->interleave_buffer_size = mem_size;
     }
 
-    float* filter_data = ( float* )filter_tensor->data;
-    interleave_kernel(filter_data, ( float* )priv_info->interleave_buffer, num_output, kernel_size);
+    float* filter_data = (float*)filter_tensor->data;
+    interleave_kernel(filter_data, (float*)priv_info->interleave_buffer, num_output, kernel_size);
 
     return 0;
 }
@@ -170,7 +169,7 @@ int fc_kernel_run(struct tensor* input_tensor, struct tensor* filter_tensor, str
     int out_num = param->num_output;
     int kernel_size = filter_tensor->dims[1];
 
-    float* input  = input_tensor->data;
+    float* input = input_tensor->data;
     float* output = output_tensor->data;
     float* biases = NULL;
     if (bias_tensor)
diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h
index 5b6e7e835..2ff456372 100644
--- a/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h
+++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_arm.h
@@ -31,7 +31,6 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 struct fc_priv_info
 {
     void* interleave_buffer;
diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c
index af6fcb7e9..25143bfe7 100644
--- a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c
+++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.c
@@ -31,19 +31,19 @@
 
 #include <arm_neon.h>
 
-
-void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *kernel, long kernel_size,
-              int8_t *output) {
+void gemv_1x8_int8(int32_t* biases, const float* scales, int8_t* inp, int8_t* kernel, long kernel_size,
+                   int8_t* output)
+{
     int8x8_t input;
     int8x16_t weight_0_1, weight_2_3, weight_4_5, weight_6_7;
     int16x8_t weight0_16, weight1_16, weight2_16, weight3_16;
     int16x8_t weight4_16, weight5_16, weight6_16, weight7_16;
     int32x4_t res = {0, 0, 0, 0};
     int32x4_t res1 = {0, 0, 0, 0};
-    int8_t *input_ptr = inp;
-    int8_t *weight_ptr = kernel;
+    int8_t* input_ptr = inp;
+    int8_t* weight_ptr = kernel;
     int remainw = (kernel_size >> 3) << 3;
-    for (int i = 0; i < remainw; i = i + 8) 
+    for (int i = 0; i < remainw; i = i + 8)
     {
         input = vld1_s8(input_ptr);
         weight_0_1 = vld1q_s8(weight_ptr);
@@ -74,7 +74,7 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke
         weight_ptr += 64;
     }
 
-    for (int i = remainw; i < kernel_size; ++i) 
+    for (int i = remainw; i < kernel_size; ++i)
     {
         weight0_16 = vmull_s8(vdup_n_s8(input_ptr[0]), vld1_s8(weight_ptr));
         res = vaddq_s32(vmovl_s16(vget_low_s16(weight0_16)), res);
@@ -83,12 +83,12 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke
         weight_ptr += 8;
     }
 
-    if (biases) 
+    if (biases)
     {
         int32x4_t bias = vld1q_s32(biases);
         int32x4_t bias1 = vld1q_s32(biases + 4);
-        res = vaddq_s32(res,bias);
-        res1 = vaddq_s32(res1,bias1);
+        res = vaddq_s32(res, bias);
+        res1 = vaddq_s32(res1, bias1);
     }
 
     float32x4_t res_f = vcvtq_f32_s32(res);
@@ -99,8 +99,8 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke
 
     res_f = vmulq_f32(res_f, scale);
     res1_f = vmulq_f32(res1_f, scale_1);
-    res_f = vaddq_f32(res_f,vdupq_n_f32(0.5f));
-    res1_f = vaddq_f32(res1_f,vdupq_n_f32(0.5f));
+    res_f = vaddq_f32(res_f, vdupq_n_f32(0.5f));
+    res1_f = vaddq_f32(res1_f, vdupq_n_f32(0.5f));
 
     res = vcvtq_s32_f32(res_f);
     res1 = vcvtq_s32_f32(res1_f);
@@ -114,18 +114,18 @@ void gemv_1x8_int8(int32_t *biases, const float *scales, int8_t *inp, int8_t *ke
     vst1_s8(output, result);
 }
 
-void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8_t *kernel, long kernel_size,
-              int8_t *output) 
+void gemv_1x2_int8(const int32_t* biases, const float* scales, int8_t* inp, int8_t* kernel, long kernel_size,
+                   int8_t* output)
 {
-    int8_t *input_ptr = inp;
-    int8_t *weight_ptr = kernel;
+    int8_t* input_ptr = inp;
+    int8_t* weight_ptr = kernel;
     int remainw = (kernel_size << 3) >> 3;
     int8x8x2_t weight;
     int8x8_t input;
     int16x8_t out_16_0, out_16_1;
     int32x4_t out_32_0, out_32_1;
     int32_t sum0 = 0, sum1 = 0;
-    for (int i = 0; i < remainw; i = i + 8) 
+    for (int i = 0; i < remainw; i = i + 8)
     {
         weight = vld2_s8(weight_ptr);
         input = vld1_s8(input_ptr);
@@ -133,15 +133,13 @@ void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8
         out_16_1 = vmull_s8(weight.val[1], input);
         out_32_0 = vpaddlq_s16(out_16_0);
         out_32_1 = vpaddlq_s16(out_16_1);
-        sum0 += vgetq_lane_s32(out_32_0, 0) + vgetq_lane_s32(out_32_0, 1) +
-                vgetq_lane_s32(out_32_0, 2) + vgetq_lane_s32(out_32_0, 3);
-        sum1 += vgetq_lane_s32(out_32_1, 0) + vgetq_lane_s32(out_32_1, 1) +
-                vgetq_lane_s32(out_32_1, 2) + vgetq_lane_s32(out_32_1, 3);
+        sum0 += vgetq_lane_s32(out_32_0, 0) + vgetq_lane_s32(out_32_0, 1) + vgetq_lane_s32(out_32_0, 2) + vgetq_lane_s32(out_32_0, 3);
+        sum1 += vgetq_lane_s32(out_32_1, 0) + vgetq_lane_s32(out_32_1, 1) + vgetq_lane_s32(out_32_1, 2) + vgetq_lane_s32(out_32_1, 3);
         weight_ptr += 16;
         input_ptr += 8;
     }
 
-    for (int i = remainw; i < kernel_size; ++i) 
+    for (int i = remainw; i < kernel_size; ++i)
     {
         sum0 += weight_ptr[0] * input_ptr[0];
         sum1 += weight_ptr[1] * input_ptr[0];
@@ -149,7 +147,7 @@ void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8
         weight_ptr += 2;
     }
 
-    if (biases) 
+    if (biases)
     {
         sum0 += biases[0];
         sum1 += biases[1];
@@ -172,51 +170,53 @@ void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8
 }
 
 // start and end channel must be 8 aligned
-void gemv1x8(const int8_t *input, const int8_t *output, int8_t *weight_interleaved,
-             const int32_t *biases, const float *scales,
+void gemv1x8(const int8_t* input, const int8_t* output, int8_t* weight_interleaved,
+             const int32_t* biases, const float* scales,
              int kernel_size, int start_channel, int end_channel, int num_thread,
-             int cpu_affinity) 
+             int cpu_affinity)
 {
     int ch = 0;
     int8_t *cur_kernel, *cur_result;
-    int32_t *cur_biases;
-    const float *cur_scales;
+    int32_t* cur_biases;
+    const float* cur_scales;
 
     // #pragma omp parallel for num_threads(num_thread)
-    for (ch = start_channel; ch < end_channel; ch += 8) 
+    for (ch = start_channel; ch < end_channel; ch += 8)
     {
-        cur_kernel = (int8_t *) (weight_interleaved + kernel_size * ch);
-        cur_result = (int8_t *) (output + ch);
-        cur_biases = biases ? (int32_t *) (biases + ch) : NULL;
+        cur_kernel = (int8_t*)(weight_interleaved + kernel_size * ch);
+        cur_result = (int8_t*)(output + ch);
+        cur_biases = biases ? (int32_t*)(biases + ch) : NULL;
         cur_scales = scales + ch;
-        gemv_1x8_int8(cur_biases, cur_scales, (int8_t *) input, cur_kernel, kernel_size,
+        gemv_1x8_int8(cur_biases, cur_scales, (int8_t*)input, cur_kernel, kernel_size,
                       cur_result);
     }
 }
 
 // start channel must be 2 aligned
-void gemv1x2(const int8_t *input, int8_t *output, int8_t *weight_interleaved,
-             const int32_t *biases, const float *scales,
-             int kernel_size,int start_channel,int end_channel,int num_thread,int cpu_affinity)
+void gemv1x2(const int8_t* input, int8_t* output, int8_t* weight_interleaved,
+             const int32_t* biases, const float* scales,
+             int kernel_size, int start_channel, int end_channel, int num_thread, int cpu_affinity)
 {
     int32_t sum;
     int ch = 0;
-    int8_t *cur_kernel;
-    int32_t *cur_biases;
-    int8_t *cur_result;
+    int8_t* cur_kernel;
+    int32_t* cur_biases;
+    int8_t* cur_result;
     const float* cur_scales;
 
-    for (ch = start_channel; ch < (end_channel & -2); ch += 2) {
-        cur_kernel = (int8_t *) (weight_interleaved + kernel_size * ch);
-        cur_result = (int8_t *) (output + ch);
-        cur_biases = biases ? (int32_t *) (biases + ch) : NULL;
+    for (ch = start_channel; ch < (end_channel & -2); ch += 2)
+    {
+        cur_kernel = (int8_t*)(weight_interleaved + kernel_size * ch);
+        cur_result = (int8_t*)(output + ch);
+        cur_biases = biases ? (int32_t*)(biases + ch) : NULL;
         cur_scales = scales + ch;
-        gemv_1x2_int8(cur_biases, cur_scales, (int8_t*) input, cur_kernel, kernel_size, cur_result);
+        gemv_1x2_int8(cur_biases, cur_scales, (int8_t*)input, cur_kernel, kernel_size, cur_result);
     }
 
-    if (end_channel & 0x1) {
-        cur_kernel = (int8_t *) (weight_interleaved + kernel_size * ch);
-        cur_result = (int8_t *) (output + ch);
+    if (end_channel & 0x1)
+    {
+        cur_kernel = (int8_t*)(weight_interleaved + kernel_size * ch);
+        cur_result = (int8_t*)(output + ch);
         sum = biases ? *(biases + ch) : 0;
         for (int j = 0; j < kernel_size; j++)
             sum = sum + input[j] * cur_kernel[j];
@@ -229,40 +229,39 @@ void gemv1x2(const int8_t *input, int8_t *output, int8_t *weight_interleaved,
     }
 }
 
-
-static void interleave_kernel(const int8_t *kernel, int8_t *kernel_interleaved, int out_chan, int kernel_size) 
+static void interleave_kernel(const int8_t* kernel, int8_t* kernel_interleaved, int out_chan, int kernel_size)
 {
     int i, j, k;
-    int8_t *cur_kernel[8];
-    int8_t *cur_kernel_interleaved;
+    int8_t* cur_kernel[8];
+    int8_t* cur_kernel_interleaved;
 
     // interleave 8 kernel
-    for (i = 0; i < (out_chan & -8); i += 8) 
+    for (i = 0; i < (out_chan & -8); i += 8)
     {
         for (j = 0; j < 8; j++)
-            cur_kernel[j] = (int8_t *) kernel + kernel_size * (i + j);
-        cur_kernel_interleaved = (int8_t *) kernel_interleaved + kernel_size * i;
+            cur_kernel[j] = (int8_t*)kernel + kernel_size * (i + j);
+        cur_kernel_interleaved = (int8_t*)kernel_interleaved + kernel_size * i;
         for (k = 0; k < kernel_size; k++)
             for (j = 0; j < 8; j++)
                 cur_kernel_interleaved[8 * k + j] = *(cur_kernel[j] + k);
     }
 
     // interleave 2 kernel
-    for (; i < (out_chan & -2); i += 2) 
+    for (; i < (out_chan & -2); i += 2)
     {
         for (j = 0; j < 2; j++)
-            cur_kernel[j] = (int8_t *) kernel + kernel_size * (i + j);
-        cur_kernel_interleaved = (int8_t *) kernel_interleaved + kernel_size * i;
+            cur_kernel[j] = (int8_t*)kernel + kernel_size * (i + j);
+        cur_kernel_interleaved = (int8_t*)kernel_interleaved + kernel_size * i;
         for (k = 0; k < kernel_size; k++)
             for (j = 0; j < 2; j++)
                 cur_kernel_interleaved[2 * k + j] = *(cur_kernel[j] + k);
     }
 
     // copy last kernel
-    if (out_chan & 0x1) 
+    if (out_chan & 0x1)
     {
-        cur_kernel[0] = (int8_t *) kernel + kernel_size * i;
-        cur_kernel_interleaved = (int8_t *) kernel_interleaved + kernel_size * i;
+        cur_kernel[0] = (int8_t*)kernel + kernel_size * i;
+        cur_kernel_interleaved = (int8_t*)kernel_interleaved + kernel_size * i;
         for (k = 0; k < kernel_size; k++)
             cur_kernel_interleaved[k] = *(cur_kernel[0] + k);
     }
@@ -270,75 +269,75 @@ static void interleave_kernel(const int8_t *kernel, int8_t *kernel_interleaved,
     return;
 }
 
-int int8_fc_kernel_prerun(struct tensor *input_tensor, \
-                    struct tensor *filter_tensor, \
-                    struct tensor *output_tensor, \
-                    struct fc_priv_info *priv_info, \
-                    struct fc_param *param) 
+int int8_fc_kernel_prerun(struct tensor* input_tensor,
+                          struct tensor* filter_tensor,
+                          struct tensor* output_tensor,
+                          struct fc_priv_info* priv_info,
+                          struct fc_param* param)
 {
-
     int num_output = param->num_output;
     int kernel_size = filter_tensor->dims[1];
     int kernel_align = ((kernel_size + 1) & -2);
 
-    if (!priv_info->interleave_buffer) 
+    if (!priv_info->interleave_buffer)
     {
         int mem_size = num_output * kernel_align;
-        void *mem = sys_malloc(mem_size);
+        void* mem = sys_malloc(mem_size);
         priv_info->interleave_buffer = mem;
         priv_info->interleave_buffer_size = mem_size;
     }
-    if (!priv_info->input_buffer) 
+    if (!priv_info->input_buffer)
     {
         int mem_size = kernel_align;
-        void *mem = sys_malloc(mem_size);
+        void* mem = sys_malloc(mem_size);
         priv_info->input_buffer = mem;
         priv_info->input_buffer_size = mem_size;
     }
 
-    int8_t *filter_data = (int8_t *) filter_tensor->data;
+    int8_t* filter_data = (int8_t*)filter_tensor->data;
 
-    interleave_kernel(filter_data, (int8_t *) priv_info->interleave_buffer, num_output,
+    interleave_kernel(filter_data, (int8_t*)priv_info->interleave_buffer, num_output,
                       kernel_size);
 
     return 0;
 }
 
-int int8_fc_kernel_run(struct tensor *input_tensor, \
-                    struct tensor *filter_tensor, \
-                    struct tensor *bias_tensor, \
-                    struct tensor *output_tensor, \
-                    struct fc_priv_info *priv_info, \
-                    struct fc_param *param, \
-                    int num_thread, int cpu_affinity) {
+int int8_fc_kernel_run(struct tensor* input_tensor,
+                       struct tensor* filter_tensor,
+                       struct tensor* bias_tensor,
+                       struct tensor* output_tensor,
+                       struct fc_priv_info* priv_info,
+                       struct fc_param* param,
+                       int num_thread, int cpu_affinity)
+{
     int out_num = param->num_output;
     int kernel_size = filter_tensor->dims[1];
 
-    int8_t *input = (int8_t *) input_tensor->data;
-    int8_t *output = (int8_t *) output_tensor->data;
-    int8_t *weight = (int8_t *) priv_info->interleave_buffer;
-    int32_t *biases = NULL;
+    int8_t* input = (int8_t*)input_tensor->data;
+    int8_t* output = (int8_t*)output_tensor->data;
+    int8_t* weight = (int8_t*)priv_info->interleave_buffer;
+    int32_t* biases = NULL;
     if (bias_tensor)
-        biases = (int32_t *) bias_tensor->data;
+        biases = (int32_t*)bias_tensor->data;
 
     float input_scale = input_tensor->scale;
     float output_scale = output_tensor->scale;
-    float *weight_scales = filter_tensor->scale_list;
-    float *requant_scales = (float *) malloc(out_num * sizeof(float));
+    float* weight_scales = filter_tensor->scale_list;
+    float* requant_scales = (float*)malloc(out_num * sizeof(float));
 
     for (int i = 0; i < out_num; i++)
         requant_scales[i] = (input_scale * weight_scales[i]) / output_scale;
 
     int out_num_8 = out_num & ~7;
 
-    for (int i = 0; i < input_tensor->dims[0]; i++) 
+    for (int i = 0; i < input_tensor->dims[0]; i++)
     {
-        int8_t *cur_input = input + i * kernel_size;
-        int8_t *cur_output = output + i * out_num;
+        int8_t* cur_input = input + i * kernel_size;
+        int8_t* cur_output = output + i * out_num;
 
         gemv1x8(cur_input, cur_output, weight, biases, requant_scales, kernel_size, 0, out_num_8, num_thread, cpu_affinity);
         if (out_num & 0x7)
-            gemv1x2(cur_input, cur_output, weight, biases, requant_scales, kernel_size, out_num_8,out_num,num_thread, cpu_affinity);
+            gemv1x2(cur_input, cur_output, weight, biases, requant_scales, kernel_size, out_num_8, out_num, num_thread, cpu_affinity);
     }
 
     return 0;
diff --git a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h
index 89011b983..f5d601cff 100644
--- a/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h
+++ b/source/device/cpu/op/fc/cortex-a/fc_kernel_int8_arm.h
@@ -22,20 +22,18 @@
 
 #include "fc_kernel_arm.h"
 
+int int8_fc_kernel_prerun(struct tensor* input_tensor,
+                          struct tensor* filter_tensor,
+                          struct tensor* output_tensor,
+                          struct fc_priv_info* priv_info,
+                          struct fc_param* param);
 
-int int8_fc_kernel_prerun(struct tensor*  input_tensor , \
-                    struct tensor*  filter_tensor ,  \
-                    struct tensor*  output_tensor , \
-                    struct fc_priv_info* priv_info ,      \
-                    struct fc_param* param) ;
-
-int int8_fc_kernel_run(struct tensor* input_tensor , \
-                 struct tensor* filter_tensor ,\
-                 struct tensor* bias_tensor ,  \
-                 struct tensor* output_tensor , \
-                 struct fc_priv_info*  priv_info , \
-                 struct fc_param* param, \
-                 int num_thread, int cpu_affinity) ;
-
+int int8_fc_kernel_run(struct tensor* input_tensor,
+                       struct tensor* filter_tensor,
+                       struct tensor* bias_tensor,
+                       struct tensor* output_tensor,
+                       struct fc_priv_info* priv_info,
+                       struct fc_param* param,
+                       int num_thread, int cpu_affinity);
 
 #endif
diff --git a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c
index beeff6f29..e53be5c71 100644
--- a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c
+++ b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c
@@ -38,7 +38,6 @@
 #include "arm_math.h"
 #include "arm_nnfunctions.h"
 
-
 struct cmsis_param
 {
     uint16_t bias_shift;
@@ -78,7 +77,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     int scale = ir_tensor->scale;
     out_shift = cal_shift(scale);
 
-    struct cmsis_param* param = ( struct cmsis_param* )sys_malloc(sizeof(struct cmsis_param));
+    struct cmsis_param* param = (struct cmsis_param*)sys_malloc(sizeof(struct cmsis_param));
 
     param->bias_shift = bias_shift;
     param->out_shift = out_shift;
@@ -105,7 +104,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* weight_tensor;
     struct tensor* bias_tensor = NULL;
     struct tensor* output_tensor;
-    struct cmsis_param* cmsis_param = ( struct cmsis_param* )exec_node->ops_priv;
+    struct cmsis_param* cmsis_param = (struct cmsis_param*)exec_node->ops_priv;
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
@@ -114,10 +113,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
-    int ret =
-        arm_fully_connected_q7(input_tensor->data, weight_tensor->data, weight_tensor->dims[1], weight_tensor->dims[0],
-                               cmsis_param->bias_shift, cmsis_param->out_shift, bias_tensor ? bias_tensor->data : NULL,
-                               output_tensor->data, exec_graph->shared_mem);
+    int ret = arm_fully_connected_q7(input_tensor->data, weight_tensor->data, weight_tensor->dims[1], weight_tensor->dims[0],
+                                     cmsis_param->bias_shift, cmsis_param->out_shift, bias_tensor ? bias_tensor->data : NULL,
+                                     output_tensor->data, exec_graph->shared_mem);
 
     if (ret != ARM_MATH_SUCCESS)
         return -1;
diff --git a/source/device/cpu/op/fc/fc_ref.c b/source/device/cpu/op/fc/fc_ref.c
index ee48612c5..b0da933ea 100644
--- a/source/device/cpu/op/fc/fc_ref.c
+++ b/source/device/cpu/op/fc/fc_ref.c
@@ -39,15 +39,14 @@
 #include <math.h>
 #include <string.h>
 
-
 struct fc_data
 {
     int need_trans;
-    int batch;    // N
-    int out_number;    // OUT
-    int hidden;    // hidden
+    int batch;      // N
+    int out_number; // OUT
+    int hidden;     // hidden
     int zero[3];    // input, kernel, output
-    float scale[3];    // input, kernel, output
+    float scale[3]; // input, kernel, output
 };
 
 static int ref_fc_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct fc_data* param)
@@ -83,12 +82,11 @@ static int ref_fc_fp32(struct tensor* input_tensor, struct tensor* output_tensor
     return 0;
 }
 
-
 static int ref_fc_fp16(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct fc_data* param)
 {
-    #if MACOS
+#if MACOS
 
-    #else
+#else
     int batch = param->batch;
     int hidden = param->hidden;
     int out_number = param->out_number;
@@ -116,7 +114,7 @@ static int ref_fc_fp16(struct tensor* input_tensor, struct tensor* output_tensor
             output[n * out_number + i] = fp32_to_fp16(tmp);
         }
     }
-    #endif
+#endif
     return 0;
 }
 
@@ -126,7 +124,7 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso
     int hidden = param->hidden;
     int out_number = param->out_number;
 
-    uint8_t* input  = (uint8_t*)input_tensor->data;
+    uint8_t* input = (uint8_t*)input_tensor->data;
     uint8_t* output = (uint8_t*)output_tensor->data;
     uint8_t* weight = (uint8_t*)weight_tensor->data;
 
@@ -141,7 +139,7 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso
     {
         int32_t* bias = (int32_t*)bias_tensor->data;
         float bias_scale = bias_tensor->scale;
-                  
+
         int n, i, j;
         for (n = 0; n < batch; n++)
         {
@@ -152,14 +150,14 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso
                 {
                     if (param->need_trans == 0)
                     {
-                        float input_fp32  = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
+                        float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
                         float weight_fp32 = ((float)weight[i * hidden + j] - (float)weight_zero) * weight_scale;
                         data += input_fp32 * weight_fp32;
                     }
                     else
                     {
-                        float input_fp32  = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
-                        float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale;                        
+                        float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
+                        float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale;
                         data += input_fp32 * weight_fp32;
                     }
                 }
@@ -173,7 +171,7 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso
         }
     }
     else
-    {       
+    {
         int n, i, j;
         for (n = 0; n < batch; n++)
         {
@@ -184,14 +182,14 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso
                 {
                     if (param->need_trans == 0)
                     {
-                        float input_fp32  = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
+                        float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
                         float weight_fp32 = ((float)weight[i * hidden + j] - (float)weight_zero) * weight_scale;
                         data += input_fp32 * weight_fp32;
                     }
                     else
                     {
-                        float input_fp32  = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
-                        float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale;                        
+                        float input_fp32 = ((float)input[n * hidden + j] - (float)input_zero) * input_scale;
+                        float weight_fp32 = ((float)weight[i + j * out_number] - (float)weight_zero) * weight_scale;
                         data += input_fp32 * weight_fp32;
                     }
                 }
@@ -208,14 +206,13 @@ static int ref_fc_uint8(struct tensor* input_tensor, struct tensor* output_tenso
     return 0;
 }
 
-
 static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, struct fc_data* param)
 {
     int batch = param->batch;
     int hidden = param->hidden;
     int out_number = param->out_number;
 
-    int8_t* input  = (int8_t*)input_tensor->data;
+    int8_t* input = (int8_t*)input_tensor->data;
     int8_t* output = (int8_t*)output_tensor->data;
     int8_t* weight = (int8_t*)weight_tensor->data;
 
@@ -224,7 +221,7 @@ static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor
     float* weight_scales = weight_tensor->scale_list;
     float* requant_scales = (float*)malloc(out_number * sizeof(float));
 
-    for (int i=0; i<out_number; i++)
+    for (int i = 0; i < out_number; i++)
         requant_scales[i] = (input_scale * weight_scales[i]) / output_scale;
 
     if (bias_tensor)
@@ -241,13 +238,13 @@ static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor
                 {
                     if (param->need_trans == 0)
                     {
-                        int8_t input_i8  = input[n * hidden + j];
+                        int8_t input_i8 = input[n * hidden + j];
                         int8_t weight_i8 = weight[i * hidden + j];
                         output_i32 += (int32_t)input_i8 * (int32_t)weight_i8;
                     }
                     else
                     {
-                        int8_t input_i8  = input[n * hidden + j];
+                        int8_t input_i8 = input[n * hidden + j];
                         int8_t weight_i8 = weight[i + j * out_number];
                         output_i32 += (int32_t)input_i8 * (int32_t)weight_i8;
                     }
@@ -273,13 +270,13 @@ static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor
                 {
                     if (param->need_trans == 0)
                     {
-                        int8_t input_i8  = input[n * hidden + j];
+                        int8_t input_i8 = input[n * hidden + j];
                         int8_t weight_i8 = weight[i * hidden + j];
                         output_i32 += (int32_t)input_i8 * (int32_t)weight_i8;
                     }
                     else
                     {
-                        int8_t input_i8  = input[n * hidden + j];
+                        int8_t input_i8 = input[n * hidden + j];
                         int8_t weight_i8 = weight[i + j * out_number];
                         output_i32 += (int32_t)input_i8 * (int32_t)weight_i8;
                     }
@@ -301,7 +298,7 @@ static int ref_fc_int8(struct tensor* input_tensor, struct tensor* output_tensor
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct fc_data* op_param = ( struct fc_data* )sys_malloc(sizeof(struct fc_data));
+    struct fc_data* op_param = (struct fc_data*)sys_malloc(sizeof(struct fc_data));
     memset(op_param, 0, sizeof(struct fc_data));
     exec_node->ops_priv = op_param;
     return 0;
@@ -325,8 +322,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem;
-    struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv;
+    struct fc_param* param = (struct fc_param*)ir_node->op.param_mem;
+    struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv;
 
     if (ir_graph->graph_layout == TENGINE_LAYOUT_NCHW)
     {
@@ -374,8 +371,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem;
-    struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv;
+    struct fc_param* param = (struct fc_param*)ir_node->op.param_mem;
+    struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv;
 
     if (ir_node->input_num > 2)
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
@@ -384,11 +381,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_fc_fp32(input_tensor, output_tensor, weight_tensor, bias_tensor, op_param);
     else if (input_tensor->data_type == TENGINE_DT_FP16)
-        #if MACOS
+#if MACOS
         TLOG_ERR("FP16 not support for mac os");
-        #else
+#else
         ret = ref_fc_fp16(input_tensor, output_tensor, weight_tensor, bias_tensor, op_param);
-        #endif
+#endif
     else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_fc_uint8(input_tensor, output_tensor, weight_tensor, bias_tensor, op_param);
     else if (input_tensor->data_type == TENGINE_DT_INT8)
diff --git a/source/device/cpu/op/fc/x86/fc_hcl_x86.c b/source/device/cpu/op/fc/x86/fc_hcl_x86.c
index 29c155676..86acbb992 100644
--- a/source/device/cpu/op/fc/x86/fc_hcl_x86.c
+++ b/source/device/cpu/op/fc/x86/fc_hcl_x86.c
@@ -38,7 +38,6 @@
 #include <math.h>
 #include <string.h>
 
-
 #if __SSE2__
 #include <emmintrin.h>
 #endif
@@ -49,11 +48,11 @@
 struct fc_data
 {
     int need_trans;
-    int batch;    // N
-    int out_number;    // OUT
-    int hidden;    // hidden
+    int batch;      // N
+    int out_number; // OUT
+    int hidden;     // hidden
     int zero[3];    // input, kernel, output
-    float scale[3];    // input, kernel, output
+    float scale[3]; // input, kernel, output
 };
 
 static int innerproduct(int inn, int inc, int inh, int inw, int outc, const float* weight, const float* input, float* output,
@@ -86,8 +85,8 @@ static int innerproduct(int inn, int inc, int inh, int inw, int outc, const floa
             _mm_storeu_ps(_sum, _sum0);
             tmp = _sum[0] + _sum[1] + _sum[2] + _sum[3];
             sum = sum + tmp;
-#else    //__AVX__
-         // TODO
+#else //__AVX__ \
+      // TODO
 #endif
 #endif
             for (; q < inc * size; q++)
@@ -105,7 +104,7 @@ static int innerproduct(int inn, int inc, int inh, int inw, int outc, const floa
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct fc_data* op_param = ( struct fc_data* )sys_malloc(sizeof(struct fc_data));
+    struct fc_data* op_param = (struct fc_data*)sys_malloc(sizeof(struct fc_data));
     memset(op_param, 0, sizeof(struct fc_data));
     exec_node->ops_priv = op_param;
     return 0;
@@ -129,8 +128,8 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem;
-    struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv;
+    struct fc_param* param = (struct fc_param*)ir_node->op.param_mem;
+    struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv;
 
     if (ir_graph->graph_layout == TENGINE_LAYOUT_NCHW)
     {
@@ -174,14 +173,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* bias_tensor;
     struct tensor* output_tensor;
     int num_thread = exec_graph->num_thread;
-    int cpu_affinity = exec_graph->cpu_affinity;    
+    int cpu_affinity = exec_graph->cpu_affinity;
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct fc_param* param = ( struct fc_param* )ir_node->op.param_mem;
-    struct fc_data* op_param = ( struct fc_data* )exec_node->ops_priv;
+    struct fc_param* param = (struct fc_param*)ir_node->op.param_mem;
+    struct fc_data* op_param = (struct fc_data*)exec_node->ops_priv;
 
     const void* input_data = input_tensor->data;
     void* weight_data = weight_tensor->data;
@@ -199,8 +198,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
         bias_data = bias_tensor->data;
     }
-    if (innerproduct(batch_number, inc, inh, inw, outc, (float*)weight_data, (float*)input_data, 
-        (float*)output_data, (float*)bias_data, num_thread, cpu_affinity) < 0)
+    if (innerproduct(batch_number, inc, inh, inw, outc, (float*)weight_data, (float*)input_data,
+                     (float*)output_data, (float*)bias_data, num_thread, cpu_affinity)
+        < 0)
         return -1;
 
     return 0;
diff --git a/source/device/cpu/op/flatten/flatten_ref.c b/source/device/cpu/op/flatten/flatten_ref.c
index be26c3354..9b4476d28 100644
--- a/source/device/cpu/op/flatten/flatten_ref.c
+++ b/source/device/cpu/op/flatten/flatten_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
diff --git a/source/device/cpu/op/gather/gather_ref.c b/source/device/cpu/op/gather/gather_ref.c
index 77d29612a..5b5c9ce9e 100644
--- a/source/device/cpu/op/gather/gather_ref.c
+++ b/source/device/cpu/op/gather/gather_ref.c
@@ -38,10 +38,9 @@
 #include <math.h>
 #include <string.h>
 
-
 typedef struct
 {
-    int* in_shape;    // the dim of the input
+    int* in_shape; // the dim of the input
     int axis;
     int indices_num;
     int dim_size;
@@ -68,22 +67,23 @@ static int ref_gather_fp32(float* input, int* input_indices, float* output, gath
         // TLOG_ERR("inner_size size: %d %d \n", inner_size, param->in_shape[i]);
     }
 
-	// #pragma omp parallel for num_threads(num_thread)
-    if(param->is_onnx){
+    // #pragma omp parallel for num_threads(num_thread)
+    if (param->is_onnx)
+    {
         for (int outer = 0; outer < outer_size; ++outer)
         {
-            memcpy(out_ptr + (outer * param->indices_num ) * inner_size,
-            in_ptr + (outer* axis_size + param->indices_num) * inner_size, inner_size* sizeof(float));
+            memcpy(out_ptr + (outer * param->indices_num) * inner_size,
+                   in_ptr + (outer * axis_size + param->indices_num) * inner_size, inner_size * sizeof(float));
         }
-    } else {
+    }
+    else
+    {
         for (int outer = 0; outer < outer_size; ++outer)
         {
             for (int i = 0; i < param->indices_num; i++)
             {
-
                 memcpy(out_ptr + (outer * param->indices_num + i) * inner_size,
-                       in_ptr + (outer * axis_size + ( int )input_indices[i]) * inner_size, inner_size * sizeof(float));
-                
+                       in_ptr + (outer * axis_size + (int)input_indices[i]) * inner_size, inner_size * sizeof(float));
             }
         }
     }
@@ -109,13 +109,13 @@ static int ref_gather_uint8(uint8_t* input, int* input_indices, uint8_t* output,
         inner_size *= param->in_shape[i];
     }
 
-	// #pragma omp parallel for num_threads(num_thread)
+    // #pragma omp parallel for num_threads(num_thread)
     for (int outer = 0; outer < outer_size; ++outer)
     {
         for (int i = 0; i < param->indices_num; i++)
         {
             memcpy(out_ptr + (outer * param->indices_num + i) * inner_size,
-                   in_ptr + (outer * axis_size + ( int )input_indices[i]) * inner_size, inner_size);
+                   in_ptr + (outer * axis_size + (int)input_indices[i]) * inner_size, inner_size);
         }
     }
 
@@ -126,14 +126,14 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct gather_param* gather_param = ( struct gather_param* )ir_node->op.param_mem;
-    gather_param_t* op_priv_info = ( gather_param_t* )exec_node->ops_priv;
+    struct gather_param* gather_param = (struct gather_param*)ir_node->op.param_mem;
+    gather_param_t* op_priv_info = (gather_param_t*)exec_node->ops_priv;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
 
     op_priv_info->axis = gather_param->axis;
     op_priv_info->indices_num = gather_param->indices_num;
     op_priv_info->is_onnx = gather_param->is_onnx;
-    op_priv_info->in_shape = (int*)sys_malloc(input_tensor->dim_num*sizeof(int));
+    op_priv_info->in_shape = (int*)sys_malloc(input_tensor->dim_num * sizeof(int));
     /* prerun now */
     return 0;
 }
@@ -146,7 +146,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
     struct tensor* indices_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
 
-    gather_param_t* op_priv_info = ( gather_param_t* )exec_node->ops_priv;
+    gather_param_t* op_priv_info = (gather_param_t*)exec_node->ops_priv;
 
     int out_size = input_tensor->elem_num;
 
@@ -168,7 +168,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_gather_fp32((float*)input, (int*)indices_data, (float*)output, op_priv_info, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_gather_uint8((uint8_t*)input, (int*)indices_data, (uint8_t*)output, op_priv_info, exec_graph->num_thread);
 
     return ret;
@@ -179,7 +179,7 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
 
-    gather_param_t* op_priv_info = ( gather_param_t* )sys_malloc(sizeof(gather_param_t));
+    gather_param_t* op_priv_info = (gather_param_t*)sys_malloc(sizeof(gather_param_t));
 
     if (op_priv_info == NULL)
     {
@@ -203,7 +203,7 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc
 }
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    gather_param_t* op_priv_info = ( gather_param_t* )exec_node->ops_priv;
+    gather_param_t* op_priv_info = (gather_param_t*)exec_node->ops_priv;
 
     sys_free(op_priv_info);
 
diff --git a/source/device/cpu/op/gru/gru_ref.c b/source/device/cpu/op/gru/gru_ref.c
index 47bbb624b..056882f3c 100644
--- a/source/device/cpu/op/gru/gru_ref.c
+++ b/source/device/cpu/op/gru/gru_ref.c
@@ -66,21 +66,21 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t
 
     /* initial_h_data buffers */
     float* initial_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* output_h_data  = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* h_0            = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    memset(initial_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(output_h_data,  0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(h_0,            0, (unsigned long)hidden_size*batch_size * sizeof(float));
+    float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(initial_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(h_0, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
 
-    float* Z_data = ( float* )malloc(hidden_size * sizeof(float));
-    float* R_data = ( float* )malloc(hidden_size * sizeof(float));
-    float* H_data = ( float* )malloc(hidden_size * sizeof(float));
+    float* Z_data = (float*)malloc(hidden_size * sizeof(float));
+    float* R_data = (float*)malloc(hidden_size * sizeof(float));
+    float* H_data = (float*)malloc(hidden_size * sizeof(float));
 
     int T = input_tensor->dims[1];
-    
-    for(int seq = 0; seq < input_tensor->dims[0]; seq++)
+
+    for (int seq = 0; seq < input_tensor->dims[0]; seq++)
     {
-        for(int t = 0; t < T; t++)
+        for (int t = 0; t < T; t++)
         {
             for (int q = 0; q < hidden_size; q++)
             {
@@ -98,7 +98,7 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t
 
                 for (int h = 0; h < hidden_size; h++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         float h_i = initial_h_data[t * hidden_size + h];
                         Z += h_i * r_data[(hidden_size * 0 + q) * hidden_size + h];
@@ -115,7 +115,7 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t
                 float r_tmp = 1.f / (1.f + exp(-R));
                 for (int k = 0; k < hidden_size; k++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         r_H += r_tmp * initial_h_data[t * hidden_size + k] * r_data[(hidden_size * 2 + q) * hidden_size + k];
                     }
@@ -132,7 +132,7 @@ int ref_gru_default_fp32(struct tensor* input_tensor, struct tensor* w, struct t
 
             for (int h = 0; h < hidden_size; h++)
             {
-                if(seq == 0)
+                if (seq == 0)
                 {
                     float Z = 1.f / (1.f + exp(-Z_data[h]));
                     float H = tanh(H_data[h]);
@@ -167,7 +167,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct
     int batch_size = input_tensor->dims[1];
     int size = input_tensor->dims[2];
     int hidden_size = param->hidden_size;
-    
+
     float* x_data = (float*)input_tensor->data;
     float* w_data = (float*)w->data;
     float* r_data = (float*)r->data;
@@ -176,21 +176,21 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct
 
     /* initial_h_data buffers */
     float* initial_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* output_h_data  = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* h_0            = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    memset(initial_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(output_h_data,  0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(h_0,            0, (unsigned long)hidden_size*batch_size * sizeof(float));
+    float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(initial_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(h_0, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
 
-    float* Z_data = ( float* )malloc(hidden_size * sizeof(float));
-    float* R_data = ( float* )malloc(hidden_size * sizeof(float));
-    float* H_data = ( float* )malloc(hidden_size * sizeof(float));
+    float* Z_data = (float*)malloc(hidden_size * sizeof(float));
+    float* R_data = (float*)malloc(hidden_size * sizeof(float));
+    float* H_data = (float*)malloc(hidden_size * sizeof(float));
 
     int T = input_tensor->dims[1];
-    
-    for(int seq = 0; seq < input_tensor->dims[0]; seq++)
+
+    for (int seq = 0; seq < input_tensor->dims[0]; seq++)
     {
-        for(int t = 0; t < T; t++)
+        for (int t = 0; t < T; t++)
         {
             for (int q = 0; q < hidden_size; q++)
             {
@@ -213,7 +213,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct
 
                 for (int h = 0; h < hidden_size; h++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         float h_i = initial_h_data[t * hidden_size + h];
                         Z += h_i * r_data[(hidden_size * 0 + q) * hidden_size + h];
@@ -233,7 +233,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct
                 float r_tmp = 1.f / (1.f + exp(-R));
                 for (int k = 0; k < hidden_size; k++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         r_H += r_tmp * initial_h_data[t * hidden_size + k] * r_data[(hidden_size * 2 + q) * hidden_size + k];
                     }
@@ -250,7 +250,7 @@ int ref_gru_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struct
 
             for (int h = 0; h < hidden_size; h++)
             {
-                if(seq == 0)
+                if (seq == 0)
                 {
                     float Z = 1.f / (1.f + exp(-Z_data[h]));
                     float H = tanh(H_data[h]);
@@ -291,23 +291,23 @@ int ref_gru_case1_fp32(struct tensor* input_tensor, struct tensor* w, struct ten
 
     /* initial_h_data buffers */
     float* initial_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* output_h_data  = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* h_0            = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* h_0 = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
     memset(initial_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
-    memset(output_h_data,  0, (unsigned long)hidden_size * batch_size * sizeof(float));
-    memset(h_0,            0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(h_0, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
 
-    float* Z_data = ( float* )malloc(hidden_size * sizeof(float));
-    float* R_data = ( float* )malloc(hidden_size * sizeof(float));
-    float* H_data = ( float* )malloc(hidden_size * sizeof(float));
+    float* Z_data = (float*)malloc(hidden_size * sizeof(float));
+    float* R_data = (float*)malloc(hidden_size * sizeof(float));
+    float* H_data = (float*)malloc(hidden_size * sizeof(float));
 
     float* output_data = (float*)output_tensor->data;
     int T = input_tensor->dims[1];
     int size = input_tensor->dims[2];
 
-    for(int seq = 0; seq < input_tensor->dims[0]; seq++)
+    for (int seq = 0; seq < input_tensor->dims[0]; seq++)
     {
-        for(int t = 0; t < T; t++)
+        for (int t = 0; t < T; t++)
         {
             for (int q = 0; q < hidden_size; q++)
             {
@@ -329,7 +329,7 @@ int ref_gru_case1_fp32(struct tensor* input_tensor, struct tensor* w, struct ten
 
                 for (int h = 0; h < hidden_size; h++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         float h_i = initial_h_data[t * hidden_size + h];
                         Z += h_i * r_data[(hidden_size * 0 + q) * hidden_size + h];
@@ -357,7 +357,7 @@ int ref_gru_case1_fp32(struct tensor* input_tensor, struct tensor* w, struct ten
 
             for (int h = 0; h < hidden_size; h++)
             {
-                if(seq == 0)
+                if (seq == 0)
                 {
                     float Z = 1.f / (1.f + exp(-Z_data[h]));
                     float H = tanh(H_data[h]);
@@ -402,7 +402,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (ir_node->input_num > 3)
         b = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[3]);
 
-    struct gru_param* param = ( struct gru_param* )(ir_node->op.param_mem);
+    struct gru_param* param = (struct gru_param*)(ir_node->op.param_mem);
 
     /* only support one way */
     if (w->dim_num == 4 && w->dims[0] == 2)
diff --git a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c
index 0d5c260d9..a0b3849e8 100644
--- a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c
+++ b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -75,7 +74,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct hard_sigmoid_param* param = ( struct hard_sigmoid_param* )ir_node->op.param_mem;
+    struct hard_sigmoid_param* param = (struct hard_sigmoid_param*)ir_node->op.param_mem;
 
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
@@ -86,7 +85,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         return -1;
     }
 
-    return ret;    
+    return ret;
 }
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
diff --git a/source/device/cpu/op/hardswish/hardswish_kernel_ref.h b/source/device/cpu/op/hardswish/hardswish_kernel_ref.h
index 6d6924206..7fe84eeb6 100644
--- a/source/device/cpu/op/hardswish/hardswish_kernel_ref.h
+++ b/source/device/cpu/op/hardswish/hardswish_kernel_ref.h
@@ -25,12 +25,10 @@
 #ifndef __HARDSWISH_KERNEL_REF_H__
 #define __HARDSWISH_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int ref_hardswish_fp32(struct tensor* input_tensor, struct tensor* output_tensor);
 
 int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tensor);
diff --git a/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c b/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c
index d5139fe2f..e2c103897 100644
--- a/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c
+++ b/source/device/cpu/op/hardswish/hardswish_kernel_ref_fp32.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_hardswish_fp32(struct tensor* input_tensor, struct tensor* output_tensor)
 {
     float* input_data = (float*)input_tensor->data;
@@ -53,7 +52,7 @@ int ref_hardswish_fp32(struct tensor* input_tensor, struct tensor* output_tensor
             tmp = 0.f;
         if (tmp > 6.f)
             tmp = 6.f;
-        
+
         output_data[i] = input_data[i] * (tmp / 6.f);
     }
 
diff --git a/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c b/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c
index 777304fd6..7252b433b 100644
--- a/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c
+++ b/source/device/cpu/op/hardswish/hardswish_kernel_ref_uint8.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tensor)
 {
     int size = input_tensor->elem_num;
@@ -53,8 +52,8 @@ int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tenso
 
     float* data_fp32 = (float*)sys_malloc(size * sizeof(float));
 
-    for(int i = 0; i < size; i++)
-        data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale;
+    for (int i = 0; i < size; i++)
+        data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
 
     for (int i = 0; i < size; i++)
     {
@@ -69,7 +68,7 @@ int ref_hardswish_uint8(struct tensor* input_tensor, struct tensor* output_tenso
     }
 
     // quant
-    for(int i=0; i<size; i++)
+    for (int i = 0; i < size; i++)
     {
         int udata = round(data_fp32[i] / output_scale + output_zero);
         if (udata > 255)
diff --git a/source/device/cpu/op/hardswish/hardswish_ref.c b/source/device/cpu/op/hardswish/hardswish_ref.c
index c836bcad6..3a1910c39 100644
--- a/source/device/cpu/op/hardswish/hardswish_ref.c
+++ b/source/device/cpu/op/hardswish/hardswish_ref.c
@@ -22,7 +22,6 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "hardswish_kernel_ref.h"
 
 #include "graph/tensor.h"
@@ -35,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -61,7 +59,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_hardswish_fp32(input_tensor, output_tensor);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_hardswish_uint8(input_tensor, output_tensor);
     else
         TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
diff --git a/source/device/cpu/op/input/input_ref.c b/source/device/cpu/op/input/input_ref.c
index 4754fff7c..4118be0da 100644
--- a/source/device/cpu/op/input/input_ref.c
+++ b/source/device/cpu/op/input/input_ref.c
@@ -32,7 +32,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     exec_node->inplace_map[0] = 0;
diff --git a/source/device/cpu/op/instancenorm/instancenorm_ref.c b/source/device/cpu/op/instancenorm/instancenorm_ref.c
index 32f4b4c4d..16fbd563f 100644
--- a/source/device/cpu/op/instancenorm/instancenorm_ref.c
+++ b/source/device/cpu/op/instancenorm/instancenorm_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -100,7 +99,7 @@ int ref_instancenorm_fp32(float* input_data, float* output_data, float* gamma_da
 }
 
 int ref_instancenorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* gamma_tensor, struct tensor* beta_tensor,
-                            float eps, float scale, float zero_point, int layout)
+                           float eps, float scale, float zero_point, int layout)
 {
     int n = input_tensor->dims[0];
     int channels = input_tensor->dims[1];
@@ -121,10 +120,10 @@ int ref_instancenorm_uint8(struct tensor* input_tensor, struct tensor* output_te
     int32_t input_zero = input_tensor->zero_point;
     int32_t output_zero = output_tensor->zero_point;
 
-    float* input_data = (float*) sys_malloc(total_size * sizeof(float));
-    float* output_data = (float*) sys_malloc(total_size * sizeof(float));
-    for(int i = 0; i < total_size; i++)
-        input_data[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale;
+    float* input_data = (float*)sys_malloc(total_size * sizeof(float));
+    float* output_data = (float*)sys_malloc(total_size * sizeof(float));
+    for (int i = 0; i < total_size; i++)
+        input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
 
     for (int s = 0; s < n; s++)
     {
@@ -168,7 +167,7 @@ int ref_instancenorm_uint8(struct tensor* input_tensor, struct tensor* output_te
     }
 
     // quant
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         int udata = (int)roundf(output_data[i] / output_scale + output_zero);
         if (udata > 255)
@@ -206,15 +205,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     void* beta_data = beta_tensor->data;
     void* gamma_data = gamma_tensor->data;
 
-    struct instancenorm_Param* param = ( struct instancenorm_Param* )node->op.param_mem;
+    struct instancenorm_Param* param = (struct instancenorm_Param*)node->op.param_mem;
     float eps = param->eps;
     float scale = 1.f;
     int zero_point = 0;
-  
+
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_instancenorm_fp32(in_data, out_data, gamma_data, beta_data, size, c, n, eps, scale, zero_point, 0);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_instancenorm_uint8(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps, scale, zero_point, 0);
 
     return ret;
diff --git a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c
index bf6bb1cfe..c7fc11e26 100644
--- a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c
+++ b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct interp_param* interp_param = ( struct interp_param* )ir_node->op.param_mem;
+    struct interp_param* interp_param = (struct interp_param*)ir_node->op.param_mem;
 
     int num_thread = exec_graph->num_thread;
 
diff --git a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c
index c3f6647b2..508567ac1 100644
--- a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c
+++ b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.c
@@ -30,16 +30,15 @@
 
 #include <arm_neon.h>
 
-
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
 static void linear_coeffs(int w, int outw, int* xofs, float* alpha)
 {
-    double scale = ( double )w / outw;
+    double scale = (double)w / outw;
 
     for (int dx = 0; dx < outw; dx++)
     {
-        float fx = ( float )((dx) * scale);
+        float fx = (float)((dx)*scale);
         int sx = floor(fx);
         fx -= sx;
 
@@ -64,12 +63,12 @@ static void linear_coeffs(int w, int outw, int* xofs, float* alpha)
 static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h,
                                   int out_w, int in_h, int in_w)
 {
-    int w = out_w;    // dst.w;
-    int h = out_h;    // dst.h;
+    int w = out_w; // dst.w;
+    int h = out_h; // dst.h;
 
     // loop body
-    float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float));
-    float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float));
+    float* rowsbuf0 = (float*)sys_malloc(w * sizeof(float));
+    float* rowsbuf1 = (float*)sys_malloc(w * sizeof(float));
     float* rows0 = rowsbuf0;
     float* rows1 = rowsbuf1;
 
@@ -89,7 +88,7 @@ static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xof
             float* rows0_old = rows0;
             rows0 = rows1;
             rows1 = rows0_old;
-            const float* S1 = src + (sy + 1) * in_w;    // src.row(sy+1);
+            const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1);
 
             const float* alphap = alpha;
             float* rows1p = rows1;
@@ -117,8 +116,8 @@ static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xof
         else
         {
             // hresize two rows
-            const float* S0 = src + sy * in_w;    // src.row(sy);
-            const float* S1 = src + (sy + 1) * in_w;    // src.row(sy+1);
+            const float* S0 = src + sy * in_w;       // src.row(sy);
+            const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1);
 
             const float* alphap = alpha;
             float* rows0p = rows0;
@@ -160,7 +159,7 @@ static void resize_bilinear_image(float* src, float* dst, float* alpha, int* xof
 
         float* rows0p = rows0;
         float* rows1p = rows1;
-        float* Dp = dst + dy * out_w;    // dst.row(dy);
+        float* Dp = dst + dy * out_w; // dst.row(dy);
 
         int nn = w >> 3;
         int remain = w - (nn << 3);
@@ -215,11 +214,11 @@ static inline void interpolate_cubic(float fx, float* coeffs)
 }
 static void cubic_coeffs(int w, int outw, int* xofs, float* alpha)
 {
-    double scale = ( double )w / outw;
+    double scale = (double)w / outw;
 
     for (int dx = 0; dx < outw; dx++)
     {
-        float fx = ( float )((dx + 0.5) * scale - 0.5);
+        float fx = (float)((dx + 0.5) * scale - 0.5);
         int sx = floor(fx);
         fx -= sx;
 
@@ -265,14 +264,14 @@ static void cubic_coeffs(int w, int outw, int* xofs, float* alpha)
 static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h,
                                  int out_w, int in_h, int in_w)
 {
-    int w = out_w;    // dst.w;
-    int h = out_h;    // dst.h;
+    int w = out_w; // dst.w;
+    int h = out_h; // dst.h;
 
     // loop body
-    float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float));
-    float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float));
-    float* rowsbuf2 = ( float* )sys_malloc(w * sizeof(float));
-    float* rowsbuf3 = ( float* )sys_malloc(w * sizeof(float));
+    float* rowsbuf0 = (float*)sys_malloc(w * sizeof(float));
+    float* rowsbuf1 = (float*)sys_malloc(w * sizeof(float));
+    float* rowsbuf2 = (float*)sys_malloc(w * sizeof(float));
+    float* rowsbuf3 = (float*)sys_malloc(w * sizeof(float));
     float* rows0 = rowsbuf0;
     float* rows1 = rowsbuf1;
     float* rows2 = rowsbuf2;
@@ -296,7 +295,7 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs
             rows1 = rows2;
             rows2 = rows3;
             rows3 = rows0_old;
-            const float* S3 = src + (sy + 2) * in_w;    // src.row(sy+2);
+            const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2);
 
             const float* alphap = alpha;
             float* rows3p = rows3;
@@ -323,8 +322,8 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs
             rows1 = rows3;
             rows2 = rows0_old;
             rows3 = rows1_old;
-            const float* S2 = src + (sy + 1) * in_w;    // src.row(sy+1);
-            const float* S3 = src + (sy + 2) * in_w;    // src.row(sy+2);
+            const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1);
+            const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2);
 
             const float* alphap = alpha;
             float* rows2p = rows2;
@@ -355,9 +354,9 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs
             rows1 = rows0_old;
             rows2 = rows1_old;
             rows3 = rows2_old;
-            const float* S1 = src + sy * in_w;    // src.row(sy);
-            const float* S2 = src + (sy + 1) * in_w;    // src.row(sy+1);
-            const float* S3 = src + (sy + 2) * in_w;    // src.row(sy+2);
+            const float* S1 = src + sy * in_w;       // src.row(sy);
+            const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1);
+            const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2);
 
             const float* alphap = alpha;
             float* rows1p = rows1;
@@ -384,10 +383,10 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs
         else
         {
             // hresize four rows
-            const float* S0 = src + (sy - 1) * in_w;    // src.row(sy-1);
-            const float* S1 = src + sy * in_w;    // src.row(sy);
-            const float* S2 = src + (sy + 1) * in_w;    // src.row(sy+1);
-            const float* S3 = src + (sy + 2) * in_w;    // src.row(sy+2);
+            const float* S0 = src + (sy - 1) * in_w; // src.row(sy-1);
+            const float* S1 = src + sy * in_w;       // src.row(sy);
+            const float* S2 = src + (sy + 1) * in_w; // src.row(sy+1);
+            const float* S3 = src + (sy + 2) * in_w; // src.row(sy+2);
 
             const float* alphap = alpha;
             float* rows0p = rows0;
@@ -427,7 +426,7 @@ static void resize_bicubic_image(float* src, float* dst, float* alpha, int* xofs
         float* rows1p = rows1;
         float* rows2p = rows2;
         float* rows3p = rows3;
-        float* Dp = dst + dy * out_w;    // dst.row(dy);
+        float* Dp = dst + dy * out_w; // dst.row(dy);
         for (int dx = 0; dx < w; dx++)
         {
             *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3;
@@ -455,8 +454,8 @@ int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct
     int in_h = input_tensor->dims[2];
     int in_w = input_tensor->dims[3];
 
-    float* data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    float* data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
 
     if (out_h == 0 || out_w == 0)
     {
@@ -483,31 +482,31 @@ int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct
         return 0;
     }
 
-    if (resize_type == 1)    // nearest
+    if (resize_type == 1) // nearest
     {
 #pragma omp parallel for num_threads(num_thread)
         for (int q = 0; q < in_c; q++)
         {
             for (int y = 0; y < out_h; ++y)
             {
-                const int in_y = MIN(( int )(y / height_scale), (in_h - 1));
+                const int in_y = MIN((int)(y / height_scale), (in_h - 1));
                 for (int x = 0; x < out_w; ++x)
                 {
-                    const int in_x = MIN(( int )(x / width_scale), (in_w - 1));
+                    const int in_x = MIN((int)(x / width_scale), (in_w - 1));
                     out_data[out_w * y + x + out_w * out_h * q] = data[in_y * in_w + in_x + q * in_w * in_h];
                 }
             }
         }
     }
-    else if (resize_type == 2)    // bilinear
+    else if (resize_type == 2) // bilinear
     {
-        int* buf = ( int* )sys_malloc((out_w + out_h + out_w * 2 + out_h * 2) * sizeof(int));
+        int* buf = (int*)sys_malloc((out_w + out_h + out_w * 2 + out_h * 2) * sizeof(int));
 
-        int* xofs = buf;    // new int[ow];
-        int* yofs = buf + out_w;    // new int[oh];
+        int* xofs = buf;         // new int[ow];
+        int* yofs = buf + out_w; // new int[oh];
 
-        float* alpha = ( float* )(buf + out_w + out_h);    // new float[ow * 2];
-        float* beta = ( float* )(buf + out_w + out_h + out_w * 2);    // new float[oh * 2];
+        float* alpha = (float*)(buf + out_w + out_h);            // new float[ow * 2];
+        float* beta = (float*)(buf + out_w + out_h + out_w * 2); // new float[oh * 2];
 
         linear_coeffs(in_w, out_w, xofs, alpha);
         linear_coeffs(in_h, out_h, yofs, beta);
@@ -521,15 +520,15 @@ int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct
 
         sys_free(buf);
     }
-    else if (resize_type == 3)    // bicubic
+    else if (resize_type == 3) // bicubic
     {
-        int* buf = ( int* )sys_malloc((out_w + out_h + out_w * 4 + out_h * 4) * sizeof(int));
+        int* buf = (int*)sys_malloc((out_w + out_h + out_w * 4 + out_h * 4) * sizeof(int));
 
-        int* xofs = buf;    // new int[ow];
-        int* yofs = buf + out_w;    // new int[oh];
+        int* xofs = buf;         // new int[ow];
+        int* yofs = buf + out_w; // new int[oh];
 
-        float* alpha = ( float* )(buf + out_w + out_h);    // new float[ow * 4];
-        float* beta = ( float* )(buf + out_w + out_h + out_w * 4);    // new float[oh * 4];
+        float* alpha = (float*)(buf + out_w + out_h);            // new float[ow * 4];
+        float* beta = (float*)(buf + out_w + out_h + out_w * 4); // new float[oh * 4];
 
         cubic_coeffs(in_w, out_w, xofs, alpha);
         cubic_coeffs(in_h, out_h, yofs, beta);
diff --git a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h
index afcf57ede..66ec13fa7 100644
--- a/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h
+++ b/source/device/cpu/op/interp/cortex-a/interp_kernel_arm.h
@@ -29,7 +29,6 @@
 
 #include "graph/tensor.h"
 
-
 int interp_run(struct tensor* output_tensor, struct tensor* input_tensor, struct interp_param* interp_param,
                int num_thread);
 
diff --git a/source/device/cpu/op/interp/interp_ref.c b/source/device/cpu/op/interp/interp_ref.c
index d77c31c5e..791ae6df8 100644
--- a/source/device/cpu/op/interp/interp_ref.c
+++ b/source/device/cpu/op/interp/interp_ref.c
@@ -37,25 +37,24 @@
 #include <math.h>
 #include <string.h>
 
-
 #define INTERP_MIN(a, b) ((a) < (b) ? (a) : (b))
 
 void linear_coeffs(int w, int outw, int* xofs, float* alpha)
 {
-    double scale = ( double )w / outw;
+    double scale = (double)w / outw;
 
-    for(int dx = 0; dx < outw; dx++)
+    for (int dx = 0; dx < outw; dx++)
     {
-        float fx = ( float )((dx) * scale);
+        float fx = (float)((dx)*scale);
         int sx = floor(fx);
         fx -= sx;
 
-        if(sx < 0)
+        if (sx < 0)
         {
             sx = 0;
             fx = 0.f;
         }
-        if(sx >= w - 1)
+        if (sx >= w - 1)
         {
             sx = w - 2;
             fx = 1.f;
@@ -70,12 +69,12 @@ void linear_coeffs(int w, int outw, int* xofs, float* alpha)
 
 void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h, int out_w, int in_h, int in_w)
 {
-    int w = out_w;  //dst.w;
-    int h = out_h;  //dst.h;
+    int w = out_w; //dst.w;
+    int h = out_h; //dst.h;
 
     // loop body
-    float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float));
-    float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float));
+    float* rowsbuf0 = (float*)sys_malloc(w * sizeof(float));
+    float* rowsbuf1 = (float*)sys_malloc(w * sizeof(float));
     float* rows0 = rowsbuf0;
     float* rows1 = rowsbuf1;
 
@@ -83,7 +82,7 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa
     memset(rowsbuf1, 0, w * sizeof(float));
 
     int prev_sy1 = -2;
-    for (int dy = 0; dy < h; dy++ )
+    for (int dy = 0; dy < h; dy++)
     {
         int sy = yofs[dy];
 
@@ -97,7 +96,7 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa
             float* rows0_old = rows0;
             rows0 = rows1;
             rows1 = rows0_old;
-            const float* S1 = src + (sy+1)*in_w;   //src.row(sy+1);
+            const float* S1 = src + (sy + 1) * in_w; //src.row(sy+1);
 
             const float* alphap = alpha;
             float* rows1p = rows1;
@@ -109,7 +108,7 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa
 
                 float a0 = alphap[0];
                 float a1 = alphap[1];
-                rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
+                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
 
                 alphap += 2;
             }
@@ -117,8 +116,8 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa
         else
         {
             // hresize two rows
-            const float* S0 = src + sy*in_w;       //src.row(sy);
-            const float* S1 = src + (sy+1)*in_w;   //src.row(sy+1);
+            const float* S0 = src + sy * in_w;       //src.row(sy);
+            const float* S1 = src + (sy + 1) * in_w; //src.row(sy+1);
 
             const float* alphap = alpha;
             float* rows0p = rows0;
@@ -132,12 +131,11 @@ void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, floa
 
                 float a0 = alphap[0];
                 float a1 = alphap[1];
-                rows0p[dx] = S0p[0]*a0 + S0p[1]*a1;
-                rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
+                rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
+                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
 
                 alphap += 2;
             }
-
         }
 
         prev_sy1 = sy;
@@ -210,7 +208,7 @@ int ref_interp_fp32(struct tensor* input_tensor, struct tensor* output_tensor, s
         int in_channel_size = in_h * in_w;
         int out_channel_size = out_h * out_w;
 
-        int* buf = (int*)sys_malloc((param->output_width + param->output_height + param->output_width*2 + param->output_height*2)*sizeof(float));
+        int* buf = (int*)sys_malloc((param->output_width + param->output_height + param->output_width * 2 + param->output_height * 2) * sizeof(float));
 
         if (buf == NULL)
         {
@@ -218,18 +216,18 @@ int ref_interp_fp32(struct tensor* input_tensor, struct tensor* output_tensor, s
             return -1;
         }
 
-        int* xofs = buf;//new int[ow];
-        int* yofs = buf + param->output_width ;//new int[oh];
+        int* xofs = buf;                       //new int[ow];
+        int* yofs = buf + param->output_width; //new int[oh];
 
-        float* alpha = (float*)(buf + param->output_width  + param->output_height);//new float[ow * 2];
-        float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width*2);//new float[oh * 2];
+        float* alpha = (float*)(buf + param->output_width + param->output_height);                          //new float[ow * 2];
+        float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width * 2); //new float[oh * 2];
 
         linear_coeffs(in_w, out_w, xofs, alpha);
         linear_coeffs(in_h, out_h, yofs, beta);
 
         for (int q = 0; q < channel; ++q)
         {
-            resize_bilinear_image(input+in_channel_size*q, output+out_channel_size*q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w);
+            resize_bilinear_image(input + in_channel_size * q, output + out_channel_size * q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w);
         }
 
         sys_free(buf);
@@ -259,9 +257,9 @@ int ref_interp_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
     float* input_fp32 = (float*)sys_malloc(input_total_size * sizeof(float));
     float* output_fp32 = (float*)sys_malloc(output_total_size * sizeof(float));
 
-    for(int i=0; i<input_total_size; i++)
+    for (int i = 0; i < input_total_size; i++)
     {
-        input_fp32[i] = ((float )input_uint8[i] - (float )input_zero) * input_scale;
+        input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     /* process */
@@ -304,7 +302,7 @@ int ref_interp_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
         int in_channel_size = in_h * in_w;
         int out_channel_size = out_h * out_w;
 
-        int* buf = (int*)sys_malloc((param->output_width + param->output_height + param->output_width*2 + param->output_height*2)*sizeof(float));
+        int* buf = (int*)sys_malloc((param->output_width + param->output_height + param->output_width * 2 + param->output_height * 2) * sizeof(float));
 
         if (buf == NULL)
         {
@@ -312,18 +310,18 @@ int ref_interp_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
             return -1;
         }
 
-        int* xofs = buf;//new int[ow];
-        int* yofs = buf + param->output_width ;//new int[oh];
+        int* xofs = buf;                       //new int[ow];
+        int* yofs = buf + param->output_width; //new int[oh];
 
-        float* alpha = (float*)(buf + param->output_width  + param->output_height);//new float[ow * 2];
-        float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width*2);//new float[oh * 2];
+        float* alpha = (float*)(buf + param->output_width + param->output_height);                          //new float[ow * 2];
+        float* beta = (float*)(buf + param->output_width + param->output_height + param->output_width * 2); //new float[oh * 2];
 
         linear_coeffs(in_w, out_w, xofs, alpha);
         linear_coeffs(in_h, out_h, yofs, beta);
 
         for (int q = 0; q < channel; ++q)
         {
-            resize_bilinear_image(input_fp32+in_channel_size*q, output_fp32+out_channel_size*q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w);
+            resize_bilinear_image(input_fp32 + in_channel_size * q, output_fp32 + out_channel_size * q, alpha, xofs, beta, yofs, out_h, out_w, in_h, in_w);
         }
 
         sys_free(buf);
@@ -335,7 +333,7 @@ int ref_interp_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
     }
 
     /* quant */
-    for(int i=0; i<output_total_size; i++)
+    for (int i = 0; i < output_total_size; i++)
     {
         int udata = round(output_fp32[i] / output_scale + output_zero);
         if (udata > 255)
@@ -372,7 +370,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct graph* graph = node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    struct interp_param* param = ( struct interp_param* )node->op.param_mem;
+    struct interp_param* param = (struct interp_param*)node->op.param_mem;
 
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
@@ -381,7 +379,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         ret = ref_interp_uint8(input_tensor, output_tensor, param);
     else
         TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
-    
+
     return ret;
 }
 
diff --git a/source/device/cpu/op/l2normalization/l2normalization_ref.c b/source/device/cpu/op/l2normalization/l2normalization_ref.c
index c10793290..ac368086a 100644
--- a/source/device/cpu/op/l2normalization/l2normalization_ref.c
+++ b/source/device/cpu/op/l2normalization/l2normalization_ref.c
@@ -34,17 +34,16 @@
 
 #include <math.h>
 
-
 int ref_l2normalization_fp32(float* input_data, float* output_data, int size, int channel_size)
 {
     float sq_l2_norm = 0;
-    for(int j = 0; j < channel_size; j++)
+    for (int j = 0; j < channel_size; j++)
     {
         const float val = input_data[j];
         sq_l2_norm += val * val;
     }
     const float l2_norm = sqrt(sq_l2_norm);
-    for(int j = 0; j < channel_size; j++)
+    for (int j = 0; j < channel_size; j++)
     {
         output_data[j] = input_data[j] / l2_norm;
     }
@@ -74,7 +73,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int input_size = 1;
     int channel_size = input_tensor->dims[1];
 
-    for(int i = 0; i < input_tensor->dim_num; i++){
+    for (int i = 0; i < input_tensor->dim_num; i++)
+    {
         input_size *= input_tensor->dims[i];
     }
 
diff --git a/source/device/cpu/op/l2pool/l2pool_ref.c b/source/device/cpu/op/l2pool/l2pool_ref.c
index 6cc5e96c3..5cf027d70 100644
--- a/source/device/cpu/op/l2pool/l2pool_ref.c
+++ b/source/device/cpu/op/l2pool/l2pool_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 struct ref_l2pool_param
 {
     int inc;
@@ -53,7 +52,7 @@ struct ref_l2pool_param
     int pad_h;
     int pad_w;
     int inn;
-    float scale[2]; // scale[0]: input scale, scale[1]: output scale
+    float scale[2];    // scale[0]: input scale, scale[1]: output scale
     int zero_point[2]; // zero_point[0]: input zero_point, zero_point[1]: output zero_point
 };
 #define L2POOL_MAX(a, b) ((a) < (b) ? (b) : (a))
@@ -61,8 +60,7 @@ struct ref_l2pool_param
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ref_l2pool_param* l2pool_op_param =
-        (struct ref_l2pool_param*)sys_malloc(sizeof(struct ref_l2pool_param));
+    struct ref_l2pool_param* l2pool_op_param = (struct ref_l2pool_param*)sys_malloc(sizeof(struct ref_l2pool_param));
     memset(l2pool_op_param, 0, sizeof(struct ref_l2pool_param));
     exec_node->ops_priv = l2pool_op_param;
     return 0;
@@ -75,11 +73,11 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 }
 void run_l2pool(float* data, float* out_data, struct ref_l2pool_param* param)
 {
-    for(int c = 0; c < param->inc; c++)
+    for (int c = 0; c < param->inc; c++)
     {
-        for(int ph = 0; ph < param->outh; ph++)
+        for (int ph = 0; ph < param->outh; ph++)
         {
-            for(int pw = 0; pw < param->outw; pw++)
+            for (int pw = 0; pw < param->outw; pw++)
             {
                 // int index = inc * (ph * outw + pw) + c;
                 int index = param->inc * (ph * param->outw + pw) + c;
@@ -95,9 +93,9 @@ void run_l2pool(float* data, float* out_data, struct ref_l2pool_param* param)
 
                 float tmp = 0.0f;
                 float val = 0.0f;
-                for(int h = h_start; h < h_end; h++)
+                for (int h = h_start; h < h_end; h++)
                 {
-                    for(int w = w_start; w < w_end; w++)
+                    for (int w = w_start; w < w_end; w++)
                     {
                         // val = data[i*param->inh*param->inc * param->inw +h * param->inc * param->inw + w * param->inc
                         // +c];
@@ -106,7 +104,7 @@ void run_l2pool(float* data, float* out_data, struct ref_l2pool_param* param)
                         pool_size++;
                     }
                 }
-                if(tmp == 0)
+                if (tmp == 0)
                 {
                     out_data[index] = 0;
                 }
@@ -123,41 +121,37 @@ int ref_l2pool_fp32(float* data, float* out_data, struct ref_l2pool_param* param
 {
     int input_size = param->inc * param->inh * param->inw;
     int output_size = param->outh * param->outw * param->outc;
-    for(int i = 0; i < param->inn; i++)
+    for (int i = 0; i < param->inn; i++)
     {
-        run_l2pool(data + i * input_size, out_data + i * output_size,param);
+        run_l2pool(data + i * input_size, out_data + i * output_size, param);
     }
     return 0;
 }
 
-
-void ConvertPaddingStyleToParameters(int stride_h, int stride_w, 
-                                         int in_height, int in_width, int filter_height, int filter_width, int paddingtype,
-                                         int out_height, int out_width,
-                                         int* padding_width, int* padding_height)
+void ConvertPaddingStyleToParameters(int stride_h, int stride_w,
+                                     int in_height, int in_width, int filter_height, int filter_width, int paddingtype,
+                                     int out_height, int out_width,
+                                     int* padding_width, int* padding_height)
 {
-    if(paddingtype == 0 || paddingtype == 2)
+    if (paddingtype == 0 || paddingtype == 2)
     {
         *padding_width = 0;
         *padding_height = 0;
     }
-    else if(paddingtype == 1)
+    else if (paddingtype == 1)
     {
         *padding_width = (int)(((out_width - 1) * stride_w + filter_width - in_width) / 2);
-        *padding_height = (int)(((out_height - 1) * stride_h + filter_height - in_height)/2);
+        *padding_height = (int)(((out_height - 1) * stride_h + filter_height - in_height) / 2);
     }
 
     return;
 }
 
-
-
 static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -180,10 +174,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int output_c = output_tensor->dims[1];
     int padding_w = 0;
     int padding_h = 0;
-    
+
     ConvertPaddingStyleToParameters(l2pool_param_op->stride_h, l2pool_param_op->stride_w, input_h, input_w,
-                                     l2pool_param_op->kernel_h, l2pool_param_op->kernel_w, l2pool_param_op->paddingType,
-                                     output_h, output_w, &padding_w, &padding_h);
+                                    l2pool_param_op->kernel_h, l2pool_param_op->kernel_w, l2pool_param_op->paddingType,
+                                    output_h, output_w, &padding_w, &padding_h);
 
     op_param->inc = input_c;
     op_param->inh = input_h;
diff --git a/source/device/cpu/op/logical/logical_ref.c b/source/device/cpu/op/logical/logical_ref.c
index 94cdb8bdd..aef2ad3f7 100644
--- a/source/device/cpu/op/logical/logical_ref.c
+++ b/source/device/cpu/op/logical/logical_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 struct logical_param_ref
 {
     int type;
@@ -118,19 +117,19 @@ static int ref_logical_fp32(float* input0, float* input1, float* output, struct
 
     switch (param->type)
     {
-        case 0:    // LogicalAnd
-        {
-            logical_and(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output);
-            break;
-        }
-        case 1:    // LogicalOr
-        {
-            logical_or(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output);
-            break;
-        }
-        default:
-            return -1;
-            ;
+    case 0: // LogicalAnd
+    {
+        logical_and(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output);
+        break;
+    }
+    case 1: // LogicalOr
+    {
+        logical_or(input_hw, input_hw_1, input_count4, input1_count4, input0, input1, output);
+        break;
+    }
+    default:
+        return -1;
+        ;
     }
     return 0;
 }
@@ -169,7 +168,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor1 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct logical_param* logical_param = ( struct logical_param* )ir_node->op.param_mem;
+    struct logical_param* logical_param = (struct logical_param*)ir_node->op.param_mem;
     struct logical_param_ref logical_param_ref;
 
     logical_param_ref.shape0[0] = 1;
@@ -182,28 +181,28 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     logical_param_ref.shape1[2] = 1;
     logical_param_ref.shape1[3] = 1;
 
-    if (input_tensor0->dims[0] !=0)
+    if (input_tensor0->dims[0] != 0)
         logical_param_ref.shape0[0] = input_tensor0->dims[0];
-    if (input_tensor0->dims[1] !=0)
+    if (input_tensor0->dims[1] != 0)
         logical_param_ref.shape0[1] = input_tensor0->dims[1];
-    if (input_tensor0->dims[2] !=0)
+    if (input_tensor0->dims[2] != 0)
         logical_param_ref.shape0[2] = input_tensor0->dims[2];
-    if (input_tensor0->dims[3] !=0)
+    if (input_tensor0->dims[3] != 0)
         logical_param_ref.shape0[3] = input_tensor0->dims[3];
 
-    if (input_tensor1->dims[0] !=0)
+    if (input_tensor1->dims[0] != 0)
         logical_param_ref.shape1[0] = input_tensor1->dims[0];
-    if (input_tensor1->dims[1] !=0)
+    if (input_tensor1->dims[1] != 0)
         logical_param_ref.shape1[1] = input_tensor1->dims[1];
-    if (input_tensor1->dims[2] !=0)
+    if (input_tensor1->dims[2] != 0)
         logical_param_ref.shape1[2] = input_tensor1->dims[2];
-    if (input_tensor1->dims[3] !=0)
+    if (input_tensor1->dims[3] != 0)
         logical_param_ref.shape1[3] = input_tensor1->dims[3];
 
     logical_param_ref.type = logical_param->type;
 
-    int ret = ref_logical_fp32((float*)input_tensor0->data, (float*)input_tensor1->data, 
-        (float*)output_tensor->data, &logical_param_ref, exec_graph->num_thread);
+    int ret = ref_logical_fp32((float*)input_tensor0->data, (float*)input_tensor1->data,
+                               (float*)output_tensor->data, &logical_param_ref, exec_graph->num_thread);
     if (ret != 0)
         return -1;
 
diff --git a/source/device/cpu/op/logistic/logistic_ref.c b/source/device/cpu/op/logistic/logistic_ref.c
index 4d363ed1a..807ff90d9 100644
--- a/source/device/cpu/op/logistic/logistic_ref.c
+++ b/source/device/cpu/op/logistic/logistic_ref.c
@@ -34,12 +34,11 @@
 
 #include <math.h>
 
-
 struct logical_param
 {
     int out_size;
     float scale[2];    // scale[0]: input scale, scale[1]: output scale
-    int zero_point[2];    // zero_point[0]: input zero_point, zero_point[1]: output zero_point
+    int zero_point[2]; // zero_point[0]: input zero_point, zero_point[1]: output zero_point
 };
 
 static int ref_logistic_fp32(float* input_data, float* output_data, struct logical_param* op_param)
@@ -58,9 +57,7 @@ static int ref_logistic_uint8(uint8_t* input, uint8_t* output, struct logical_pa
     for (int i = 0; i < op_param->out_size; i++)
     {
         /* get max */
-        output[i] =
-            (1.f / (1.f + exp(-(input[i] - (double )op_param->zero_point[0]) * op_param->scale[0]))) / op_param->scale[1] +
-            op_param->zero_point[1];
+        output[i] = (1.f / (1.f + exp(-(input[i] - (double)op_param->zero_point[0]) * op_param->scale[0]))) / op_param->scale[1] + op_param->zero_point[1];
     }
 
     return 0;
diff --git a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c
index 3558efe6b..2af74c63d 100644
--- a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c
+++ b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c
@@ -37,35 +37,33 @@
 #include <math.h>
 #include <string.h>
 
-
 struct ref_logsoftmax_param
 {
     int axis;
     int in_size;
     int on_size;
     int out_size;
-    float scale[2]; // scale[0]: input scale, scale[1]: output scale
+    float scale[2];    // scale[0]: input scale, scale[1]: output scale
     int zero_point[2]; // zero_point[0]: input zero_point, zero_point[1]: output zero_point
 };
 
 static void GetMaxArray(float* input, float* array, int in_size, int on_size)
 {
-    float* input_ptr = ( float* )input;
-    float* array_ptr = ( float* )array;
+    float* input_ptr = (float*)input;
+    float* array_ptr = (float*)array;
     memset(array, 0, in_size * sizeof(float));
 
-    for(int j = 0; j < on_size; j++)
-        for(int l = 0; l < in_size; l++)
+    for (int j = 0; j < on_size; j++)
+        for (int l = 0; l < in_size; l++)
         {
-            if(array_ptr[l] < input_ptr[j * in_size + l])
+            if (array_ptr[l] < input_ptr[j * in_size + l])
                 array_ptr[l] = input_ptr[j * in_size + l];
         }
 }
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ref_logsoftmax_param* logsoftmax_op_param =
-        (struct ref_logsoftmax_param*)sys_malloc(sizeof(struct ref_logsoftmax_param));
+    struct ref_logsoftmax_param* logsoftmax_op_param = (struct ref_logsoftmax_param*)sys_malloc(sizeof(struct ref_logsoftmax_param));
     memset(logsoftmax_op_param, 0, sizeof(struct ref_logsoftmax_param));
     exec_node->ops_priv = logsoftmax_op_param;
     return 0;
@@ -79,17 +77,17 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static void GetOutResult(float* input, float* output, float* array, float* sum_array, int in_size, int on_size)
 {
-    float* input_ptr = ( float* )input;
-    float* output_ptr = ( float* )output;
-    float* array_ptr = ( float* )array;
-    float* sum_array_ptr = ( float* )sum_array;
+    float* input_ptr = (float*)input;
+    float* output_ptr = (float*)output;
+    float* array_ptr = (float*)array;
+    float* sum_array_ptr = (float*)sum_array;
 
     memset(sum_array, 0x0, in_size * sizeof(float));
 
     /* get the exp and the summary */
 
-    for(int j = 0; j < on_size; j++)
-        for(int l = 0; l < in_size; l++)
+    for (int j = 0; j < on_size; j++)
+        for (int l = 0; l < in_size; l++)
         {
             int index = j * in_size + l;
             output_ptr[index] = exp(input_ptr[index] - array_ptr[l]);
@@ -97,18 +95,18 @@ static void GetOutResult(float* input, float* output, float* array, float* sum_a
         }
 
     /* the final result */
-    for(int j = 0; j < on_size; j++)
-        for(int l = 0; l < in_size; l++)
+    for (int j = 0; j < on_size; j++)
+        for (int l = 0; l < in_size; l++)
         {
             int index = j * in_size + l;
             output_ptr[index] /= sum_array_ptr[l];
-            output_ptr[index]=log(output_ptr[index]);
+            output_ptr[index] = log(output_ptr[index]);
         }
 }
 
 static int ref_logsoftmax_fp32(float* input_data, float* output_data, float* max_array, float* sum_array, struct ref_logsoftmax_param* op_param)
 {
-    for(int i = 0; i < op_param->out_size; i++)
+    for (int i = 0; i < op_param->out_size; i++)
     {
         int img_base = i * op_param->in_size * op_param->on_size;
         GetMaxArray(input_data + img_base, max_array, op_param->in_size, op_param->on_size);
@@ -149,27 +147,27 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     //
     int axis = param_->axis;
     int out_size = 1;
-    for(int i = 0; i < axis; i++)
+    for (int i = 0; i < axis; i++)
     {
         out_size *= dims[i];
     }
     int in_size = 1;
-    for(size_t i = axis + 1; i < input_tensor->dim_num; i++)
+    for (size_t i = axis + 1; i < input_tensor->dim_num; i++)
     {
         in_size *= dims[i];
     }
     int on_size = dims[axis];
 
-    max_array = ( float* )sys_malloc(in_size * sizeof(float));
-    sum_array = ( float* )sys_malloc(in_size * sizeof(float));
+    max_array = (float*)sys_malloc(in_size * sizeof(float));
+    sum_array = (float*)sys_malloc(in_size * sizeof(float));
 
     ref_logsoftmax_param.in_size = in_size;
     ref_logsoftmax_param.on_size = on_size;
 
     if (input_tensor->data_type == TENGINE_DT_FP32)
-        ref_logsoftmax_fp32((float*)input_tensor->data, (float*)output_tensor->data,max_array,sum_array, &ref_logsoftmax_param);
+        ref_logsoftmax_fp32((float*)input_tensor->data, (float*)output_tensor->data, max_array, sum_array, &ref_logsoftmax_param);
     // else
-        // ref_logistic_uint8(input_tensor->data, output_tensor->data, &logical_param);
+    // ref_logistic_uint8(input_tensor->data, output_tensor->data, &logical_param);
 
     return 0;
 }
diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c
index f28faca6e..fc883f9f2 100644
--- a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c
+++ b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct lrn_param* lrn_param = ( struct lrn_param* )ir_node->op.param_mem;
+    struct lrn_param* lrn_param = (struct lrn_param*)ir_node->op.param_mem;
 
     int ret = lrn_run(output_tensor, input_tensor, lrn_param, exec_graph->num_thread);
     if (ret != 0)
diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c
index e2606992d..a86e6571a 100644
--- a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c
+++ b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.c
@@ -38,7 +38,6 @@
 
 #include <arm_neon.h>
 
-
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -126,8 +125,8 @@ static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, struct tab* coeffs)
 
 static inline float32x4_t vexpq_f32(float32x4_t x)
 {
-    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);    // ln(2)
-    const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f);    // 1/ln(2)
+    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);     // ln(2)
+    const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
     const float32x4_t CONST_0 = vdupq_n_f32(0.f);
     const int32x4_t CONST_NEGATIVE_126 = vdupq_n_s32(-126);
 
@@ -147,8 +146,8 @@ static inline float32x4_t vexpq_f32(float32x4_t x)
 
 static inline float32x4_t vlogq_f32(float32x4_t x)
 {
-    const int32x4_t CONST_127 = vdupq_n_s32(127);    // 127
-    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);    // ln(2)
+    const int32x4_t CONST_127 = vdupq_n_s32(127);             // 127
+    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
 
     // Extract exponent
     int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
@@ -186,7 +185,7 @@ static inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 static void lrn_kernel(int i, int id, void* data, const float* input, float* output, float* square, float alpha,
                        float beta, float bias, int local_size, int channel_size, int channel_num, int num_thread)
 {
-    int step = (( int* )data)[0];
+    int step = ((int*)data)[0];
     const float32x4_t alpha_vec = vdupq_n_f32(alpha / local_size);
     const float32x4_t beta_vec = vdupq_n_f32(beta);
     const float32x4_t bias_vec = vdupq_n_f32(bias);
@@ -238,9 +237,9 @@ int lrn_run(struct tensor* output_tensor, struct tensor* input_tensor, struct lr
             int num_thread)
 {
     init_tab();
-    const float* input = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
-    float* square = ( float* )(malloc(input_tensor->elem_num * sizeof(float)));
+    const float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
+    float* square = (float*)(malloc(input_tensor->elem_num * sizeof(float)));
 
     int n = input_tensor->dims[0];
     int c = input_tensor->dims[1];
diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h
index 45330b725..f4c1e20ae 100644
--- a/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h
+++ b/source/device/cpu/op/lrn/cortex-a/lrn_kernel_arm.h
@@ -31,7 +31,6 @@
 
 #include <arm_neon.h>
 
-
 struct tab
 {
     float32x4_t a0;
diff --git a/source/device/cpu/op/lrn/lrn_ref.c b/source/device/cpu/op/lrn/lrn_ref.c
index d7026e19d..ff71d6903 100644
--- a/source/device/cpu/op/lrn/lrn_ref.c
+++ b/source/device/cpu/op/lrn/lrn_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static int ref_lrn_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct lrn_param* param,
                         int num_thread)
 {
@@ -57,8 +56,8 @@ static int ref_lrn_fp32(struct tensor* input_tensor, struct tensor* output_tenso
     float* in_data = (float*)input_tensor->data;
     float* out_data = (float*)output_tensor->data;
 
-    float* square = ( float* )(malloc(img_size * sizeof(float)));
-    float* accum_square = ( float* )(malloc(channel_size * sizeof(float)));
+    float* square = (float*)(malloc(img_size * sizeof(float)));
+    float* accum_square = (float*)(malloc(channel_size * sizeof(float)));
 
     for (int i = 0; i < n; i++)
     {
@@ -130,7 +129,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct lrn_param* lrn_param = ( struct lrn_param* )ir_node->op.param_mem;
+    struct lrn_param* lrn_param = (struct lrn_param*)ir_node->op.param_mem;
 
     ref_lrn_fp32(input_tensor, output_tensor, lrn_param, exec_graph->num_thread);
 
diff --git a/source/device/cpu/op/lstm/lstm_ref.c b/source/device/cpu/op/lstm/lstm_ref.c
index 73849b33c..0367e9f56 100644
--- a/source/device/cpu/op/lstm/lstm_ref.c
+++ b/source/device/cpu/op/lstm/lstm_ref.c
@@ -56,23 +56,23 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct
     float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
     float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
 
-    memset(init_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(init_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(output_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    
+    memset(init_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(init_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+
     float* output_data = (float*)output_tensor->data;
     int T = input_tensor->dims[1];
     int size = input_tensor->dims[2];
 
-    float* i_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* f_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* o_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* g_flag = ( float* )malloc(hidden_size * sizeof(float));
+    float* i_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* f_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* o_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* g_flag = (float*)malloc(hidden_size * sizeof(float));
 
-    for(int seq = 0; seq < input_tensor->dims[0]; seq++)
+    for (int seq = 0; seq < input_tensor->dims[0]; seq++)
     {
-        for(int i = 0; i < T; i++)
+        for (int i = 0; i < T; i++)
         {
             for (int q = 0; q < hidden_size; q++)
             {
@@ -92,7 +92,7 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct
 
                 for (int h = 0; h < hidden_size; h++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         float h_i = init_h_data[h + i * hidden_size];
                         I += h_i * (r_data[(hidden_size * 0 + q) * hidden_size + h]);
@@ -118,14 +118,14 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct
 
             for (int c = 0; c < hidden_size; c++)
             {
-                if( seq == 0)
+                if (seq == 0)
                 {
                     float I = 1.f / (1.f + exp(-i_flag[c]));
                     float F = 1.f / (1.f + exp(-f_flag[c]));
                     float G = tanh(g_flag[c]);
                     float c_i = init_c_data[c + i * hidden_size];
                     float cell2 = F * c_i + I * G;
-                    float O = 1.f/(1.f + exp(-o_flag[c]));
+                    float O = 1.f / (1.f + exp(-o_flag[c]));
                     float tmp = tanh(cell2);
                     float H = O * tmp;
                     output_c_data[i * hidden_size + c] = cell2;
@@ -139,7 +139,7 @@ int ref_lstm_default_fp32(struct tensor* input_tensor, struct tensor* w, struct
                     float G = tanh(g_flag[c]);
                     float c_i = output_c_data[c + i * hidden_size];
                     float cell2 = F * c_i + I * G;
-                    float O = 1.f/(1.f + exp(-o_flag[c]));
+                    float O = 1.f / (1.f + exp(-o_flag[c]));
                     float H = O * tanh(cell2);
                     output_c_data[i * hidden_size + c] = cell2;
                     output_h_data[i * hidden_size + c] = H;
@@ -177,24 +177,24 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc
     float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
     float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
 
-    memset(init_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(init_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(output_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
+    memset(init_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(init_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
 
     float* output_data = (float*)output_tensor->data;
 
     int T = input_tensor->dims[1];
     int size = input_tensor->dims[2];
 
-    float* i_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* f_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* o_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* g_flag = ( float* )malloc(hidden_size * sizeof(float));
+    float* i_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* f_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* o_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* g_flag = (float*)malloc(hidden_size * sizeof(float));
 
-    for(int seq = 0; seq < input_tensor->dims[0]; seq++)
+    for (int seq = 0; seq < input_tensor->dims[0]; seq++)
     {
-        for(int i = 0; i < T; i++)
+        for (int i = 0; i < T; i++)
         {
             for (int q = 0; q < hidden_size; q++)
             {
@@ -217,7 +217,7 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc
                 G += b_data[hidden_size * 3 + q];
                 for (int h = 0; h < hidden_size; h++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         float h_i = init_h_data[h + i * hidden_size];
                         I += h_i * (r_data[(hidden_size * 0 + q) * hidden_size + h]);
@@ -247,14 +247,14 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc
 
             for (int c = 0; c < hidden_size; c++)
             {
-                if( seq == 0)
+                if (seq == 0)
                 {
                     float I = 1.f / (1.f + exp(-i_flag[c]));
                     float F = 1.f / (1.f + exp(-f_flag[c]));
                     float G = tanh(g_flag[c]);
                     float c_i = init_c_data[c + i * hidden_size];
                     float cell2 = F * c_i + I * G;
-                    float O = 1.f/(1.f + exp(-o_flag[c]));
+                    float O = 1.f / (1.f + exp(-o_flag[c]));
                     float tmp = tanh(cell2);
                     float H = O * tmp;
                     output_c_data[i * hidden_size + c] = cell2;
@@ -268,7 +268,7 @@ int ref_lstm_with_bias_fp32(struct tensor* input_tensor, struct tensor* w, struc
                     float G = tanh(g_flag[c]);
                     float c_i = output_c_data[c + i * hidden_size];
                     float cell2 = F * c_i + I * G;
-                    float O = 1.f/(1.f + exp(-o_flag[c]));
+                    float O = 1.f / (1.f + exp(-o_flag[c]));
                     float H = O * tanh(cell2);
                     output_c_data[i * hidden_size + c] = cell2;
                     output_h_data[i * hidden_size + c] = H;
@@ -301,10 +301,10 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w,
     float* b_data = (float*)b->data;
 
     /* initial h, initial c buffers */
-    float* init_h_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* init_c_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* output_h_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    float* output_c_data = ( float* )malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* init_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* init_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
+    float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
 
     memset(init_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
     memset(init_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
@@ -313,16 +313,16 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w,
 
     float* output_data = (float*)output_tensor->data;
 
-    float* i_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* f_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* o_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* g_flag = ( float* )malloc(hidden_size * sizeof(float));
+    float* i_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* f_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* o_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* g_flag = (float*)malloc(hidden_size * sizeof(float));
 
-    for (int seq = 0; seq < sequence_size; seq++)    // sequence
+    for (int seq = 0; seq < sequence_size; seq++) // sequence
     {
-        for (int i = 0; i < batch_size; i++)    // batch
+        for (int i = 0; i < batch_size; i++) // batch
         {
-            for (int q = 0; q < hidden_size; q++)    // hidden
+            for (int q = 0; q < hidden_size; q++) // hidden
             {
                 float I = 0;
                 float F = 0;
@@ -330,7 +330,7 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w,
                 float G = 0;
 
                 /* input fc */
-                for (int m = 0; m < size; m++)    // internal size, the same as four fc implement
+                for (int m = 0; m < size; m++) // internal size, the same as four fc implement
                 {
                     int index = seq * (batch_size * size) + i * size + m;
                     float i_data = x_data[index];
@@ -420,9 +420,9 @@ int ref_lstm_with_bias_case1_fp32(struct tensor* input_tensor, struct tensor* w,
     return 0;
 }
 
-int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, struct tensor* r, 
-                                struct tensor* b, struct tensor* sequence_lens, struct tensor* init_h, struct tensor* init_c, struct tensor* p, 
-                                struct tensor* output_tensor, struct lstm_param* param)
+int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w, struct tensor* r,
+                                 struct tensor* b, struct tensor* sequence_lens, struct tensor* init_h, struct tensor* init_c, struct tensor* p,
+                                 struct tensor* output_tensor, struct lstm_param* param)
 {
     int batch_size = input_tensor->dims[1];
     int hidden_size = param->hidden_size;
@@ -434,25 +434,25 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w,
     float* init_h_data = (float*)init_h->data;
     float* init_c_data = (float*)init_c->data;
     float* p_data = (float*)p->data;
-    
+
     float* output_data = (float*)output_tensor->data;
 
     float* output_h_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
     float* output_c_data = (float*)malloc((unsigned long)hidden_size * batch_size * sizeof(float));
-    memset(output_h_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
-    memset(output_c_data, 0, (unsigned long)hidden_size*batch_size * sizeof(float));
+    memset(output_h_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
+    memset(output_c_data, 0, (unsigned long)hidden_size * batch_size * sizeof(float));
 
     int T = input_tensor->dims[1];
     int size = input_tensor->dims[2];
 
-    float* i_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* f_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* o_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* g_flag = ( float* )malloc(hidden_size * sizeof(float));
+    float* i_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* f_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* o_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* g_flag = (float*)malloc(hidden_size * sizeof(float));
 
-    for(int seq = 0; seq < input_tensor->dims[0]; seq++)
+    for (int seq = 0; seq < input_tensor->dims[0]; seq++)
     {
-        for(int i = 0; i < T; i++)
+        for (int i = 0; i < T; i++)
         {
             for (int q = 0; q < hidden_size; q++)
             {
@@ -475,7 +475,7 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w,
                 G += b_data[hidden_size * 3 + q];
                 for (int h = 0; h < hidden_size; h++)
                 {
-                    if(seq == 0)
+                    if (seq == 0)
                     {
                         float h_i = init_h_data[h + i * hidden_size];
                         I += h_i * (r_data[(hidden_size * 0 + q) * hidden_size + h]);
@@ -505,14 +505,14 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w,
 
             for (int c = 0; c < hidden_size; c++)
             {
-                if( seq == 0)
+                if (seq == 0)
                 {
                     float I = 1.f / (1.f + exp(-i_flag[c]));
                     float F = 1.f / (1.f + exp(-f_flag[c]));
                     float G = tanh(g_flag[c]);
                     float c_i = init_c_data[c + i * hidden_size];
                     float cell2 = F * c_i + I * G;
-                    float O = 1.f/(1.f + exp(-(o_flag[c] + p_data[0 * hidden_size + c] * cell2)));
+                    float O = 1.f / (1.f + exp(-(o_flag[c] + p_data[0 * hidden_size + c] * cell2)));
                     float tmp = tanh(cell2);
                     float H = O * tmp;
                     output_c_data[i * hidden_size + c] = cell2;
@@ -526,7 +526,7 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w,
                     float G = tanh(g_flag[c]);
                     float c_i = output_c_data[c + i * hidden_size];
                     float cell2 = F * c_i + I * G;
-                    float O = 1.f/(1.f + exp(-(o_flag[c] + p_data[2 * hidden_size + c] * cell2)));
+                    float O = 1.f / (1.f + exp(-(o_flag[c] + p_data[2 * hidden_size + c] * cell2)));
                     float H = O * tanh(cell2);
                     output_c_data[i * hidden_size + c] = cell2;
                     output_h_data[i * hidden_size + c] = H;
@@ -542,8 +542,8 @@ int ref_lstm_with_peepholes_fp32(struct tensor* input_tensor, struct tensor* w,
     free(f_flag);
     free(o_flag);
     free(g_flag);
-    
-    return 0;    
+
+    return 0;
 }
 
 int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tensor* w, struct tensor* r, struct tensor* b, struct tensor* output_tensor, struct lstm_param* param)
@@ -570,16 +570,16 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens
     int size = input_tensor->dims[2];
     int direct_num = input_tensor->dims[0];
 
-    float* i_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* f_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* o_flag = ( float* )malloc(hidden_size * sizeof(float));
-    float* g_flag = ( float* )malloc(hidden_size * sizeof(float));
+    float* i_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* f_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* o_flag = (float*)malloc(hidden_size * sizeof(float));
+    float* g_flag = (float*)malloc(hidden_size * sizeof(float));
 
-    for(int seq = 0; seq < input_tensor->dims[0]; seq++)
+    for (int seq = 0; seq < input_tensor->dims[0]; seq++)
     {
-        for(int i = 0; i < T; i++)
+        for (int i = 0; i < T; i++)
         {
-            for(int d = 0; d < direct_num; d++)
+            for (int d = 0; d < direct_num; d++)
             {
                 for (int q = 0; q < hidden_size; q++)
                 {
@@ -602,7 +602,7 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens
                     G += b_data[d * hidden_size * 4 * 2 + hidden_size * 3 + q];
                     for (int h = 0; h < hidden_size; h++)
                     {
-                        if(seq == 0)
+                        if (seq == 0)
                         {
                             float h_i = init_h_data[d * input_tensor->dims[1] * hidden_size + h + i * hidden_size];
                             I += h_i * (r_data[d * hidden_size * hidden_size * 4 + (hidden_size * 0 + q) * hidden_size + h]);
@@ -631,14 +631,14 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens
                 }
                 for (int c = 0; c < hidden_size; c++)
                 {
-                    if( seq == 0)
+                    if (seq == 0)
                     {
                         float I = 1.f / (1.f + exp(-i_flag[c]));
                         float F = 1.f / (1.f + exp(-f_flag[c]));
                         float G = tanh(g_flag[c]);
                         float c_i = init_c_data[d * hidden_size * input_tensor->dims[2] + c + i * hidden_size];
                         float cell2 = F * c_i + I * G;
-                        float O = 1.f/(1.f + exp(-o_flag[c]));
+                        float O = 1.f / (1.f + exp(-o_flag[c]));
                         float tmp = tanh(cell2);
                         float H = O * tmp;
                         output_c_data[d * hidden_size * input_tensor->dims[2] + i * hidden_size + c] = cell2;
@@ -652,7 +652,7 @@ int ref_lstm_with_bias_bidirection_fp32(struct tensor* input_tensor, struct tens
                         float G = tanh(g_flag[c]);
                         float c_i = output_c_data[d * hidden_size * input_tensor->dims[2] + c + i * hidden_size];
                         float cell2 = F * c_i + I * G;
-                        float O = 1.f/(1.f + exp(-o_flag[c]));
+                        float O = 1.f / (1.f + exp(-o_flag[c]));
                         float H = O * tanh(cell2);
                         output_c_data[d * hidden_size * input_tensor->dims[2] + i * hidden_size + c] = cell2;
                         output_h_data[d * hidden_size * input_tensor->dims[2] + i * hidden_size + c] = H;
@@ -692,14 +692,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* w = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* r = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    
+
     struct tensor* b = NULL;
     struct tensor* sequence_lens = NULL;
     struct tensor* init_h = NULL;
     struct tensor* init_c = NULL;
     struct tensor* p = NULL;
 
-    lstm_param_t* param = ( struct lstm_param* )(ir_node->op.param_mem);
+    lstm_param_t* param = (struct lstm_param*)(ir_node->op.param_mem);
 
     /* only support one way */
     if (w->dim_num == 4 && w->dims[0] == 2)
@@ -745,7 +745,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct lstm_param* lstm_param = ( struct lstm_param* )(node->op.param_mem);
+    struct lstm_param* lstm_param = (struct lstm_param*)(node->op.param_mem);
 
     int batch_size = input->dims[1];
     if (lstm_param->mxnet_flag == 0)
diff --git a/source/device/cpu/op/matmul/matmul_ref.c b/source/device/cpu/op/matmul/matmul_ref.c
index e52961f3f..4f0000547 100644
--- a/source/device/cpu/op/matmul/matmul_ref.c
+++ b/source/device/cpu/op/matmul/matmul_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 struct ref_matmul_data
 {
     int batch;
@@ -117,7 +116,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     else if (dim_size == 2)
     {
         param.batch = 1;
-        param.c = 1;    // input0->Getse().Shape(0);
+        param.c = 1; // input0->Getse().Shape(0);
         param.m = input_tensor->dims[0];
         param.n = input_tensor1->dims[1];
         param.k = input_tensor->dims[1];
diff --git a/source/device/cpu/op/maximum/maximum_ref.c b/source/device/cpu/op/maximum/maximum_ref.c
index 23c02aca5..ecb34f774 100644
--- a/source/device/cpu/op/maximum/maximum_ref.c
+++ b/source/device/cpu/op/maximum/maximum_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 struct maximum_op_param
 {
     int in_num;
@@ -60,7 +59,7 @@ static int ref_maximum_fp32(const float** in_data, float* out_data, int size, co
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )sys_malloc(sizeof(struct maximum_op_param));
+    struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)sys_malloc(sizeof(struct maximum_op_param));
     exec_node->ops_priv = maximum_op_param;
 
     return 0;
@@ -76,12 +75,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )exec_node->ops_priv;
+    struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)exec_node->ops_priv;
 
     int in_num = ir_node->input_num;
 
     maximum_op_param->in_num = in_num;
-    maximum_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num);
+    maximum_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num);
 
     return 0;
 }
@@ -94,7 +93,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
     uint32_t elem_num = input_tensor_a->elem_num;
-    struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )exec_node->ops_priv;
+    struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)exec_node->ops_priv;
     for (int i = 0; i < maximum_op_param->in_num; i++)
     {
         struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]);
@@ -102,17 +101,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         maximum_op_param->input_data[i] = data;
     }
 
-    const void** input = ( const void** )maximum_op_param->input_data;
+    const void** input = (const void**)maximum_op_param->input_data;
     float* output = (float*)output_tensor->data;
 
-    ref_maximum_fp32(( const float** )input, output, elem_num, maximum_op_param);
+    ref_maximum_fp32((const float**)input, output, elem_num, maximum_op_param);
 
     return 0;
 }
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct maximum_op_param* maximum_op_param = ( struct maximum_op_param* )exec_node->ops_priv;
+    struct maximum_op_param* maximum_op_param = (struct maximum_op_param*)exec_node->ops_priv;
 
     sys_free(maximum_op_param->input_data);
 
diff --git a/source/device/cpu/op/mean/mean_ref.c b/source/device/cpu/op/mean/mean_ref.c
index ed7bfc346..1ccd4697b 100644
--- a/source/device/cpu/op/mean/mean_ref.c
+++ b/source/device/cpu/op/mean/mean_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 struct mean_op_param
 {
     int in_num;
@@ -52,14 +51,14 @@ static int ref_mean_fp32(const float** in_data, float* out_data, int size, const
             const float* data = in_data[n];
             sum += data[i];
         }
-        out_data[i] = sum / ( float )in_num;
+        out_data[i] = sum / (float)in_num;
     }
     return 0;
 }
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct mean_op_param* mean_op_param = ( struct mean_op_param* )sys_malloc(sizeof(struct mean_op_param));
+    struct mean_op_param* mean_op_param = (struct mean_op_param*)sys_malloc(sizeof(struct mean_op_param));
     exec_node->ops_priv = mean_op_param;
     return 0;
 }
@@ -74,12 +73,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct mean_op_param* mean_op_param = ( struct mean_op_param* )exec_node->ops_priv;
+    struct mean_op_param* mean_op_param = (struct mean_op_param*)exec_node->ops_priv;
 
     int in_num = ir_node->input_num;
 
     mean_op_param->in_num = in_num;
-    mean_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num);
+    mean_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num);
 
     return 0;
 }
@@ -92,7 +91,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
     uint32_t elem_num = input_tensor_a->elem_num;
-    struct mean_op_param* mean_op_param = ( struct mean_op_param* )exec_node->ops_priv;
+    struct mean_op_param* mean_op_param = (struct mean_op_param*)exec_node->ops_priv;
     for (int i = 0; i < mean_op_param->in_num; i++)
     {
         struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]);
@@ -100,17 +99,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         mean_op_param->input_data[i] = data;
     }
 
-    const void** input = ( const void** )mean_op_param->input_data;
+    const void** input = (const void**)mean_op_param->input_data;
     float* output = (float*)output_tensor->data;
 
-    ref_mean_fp32(( const float** )input, output, elem_num, mean_op_param);
+    ref_mean_fp32((const float**)input, output, elem_num, mean_op_param);
 
     return 0;
 }
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct mean_op_param* mean_op_param = ( struct mean_op_param* )exec_node->ops_priv;
+    struct mean_op_param* mean_op_param = (struct mean_op_param*)exec_node->ops_priv;
 
     sys_free(mean_op_param->input_data);
 
diff --git a/source/device/cpu/op/minimum/minimum_ref.c b/source/device/cpu/op/minimum/minimum_ref.c
index 076cf851e..19319eb2f 100644
--- a/source/device/cpu/op/minimum/minimum_ref.c
+++ b/source/device/cpu/op/minimum/minimum_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 struct minimum_op_param
 {
     int in_num;
@@ -60,7 +59,7 @@ static int ref_minimum_fp32(const float** in_data, float* out_data, int size, co
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )sys_malloc(sizeof(struct minimum_op_param));
+    struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)sys_malloc(sizeof(struct minimum_op_param));
     exec_node->ops_priv = minimum_op_param;
     return 0;
 }
@@ -75,12 +74,12 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )exec_node->ops_priv;
+    struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)exec_node->ops_priv;
 
     int in_num = ir_node->input_num;
 
     minimum_op_param->in_num = in_num;
-    minimum_op_param->input_data = ( void** )sys_malloc(sizeof(void*) * in_num);
+    minimum_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num);
 
     return 0;
 }
@@ -93,7 +92,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
     uint32_t elem_num = input_tensor_a->elem_num;
-    struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )exec_node->ops_priv;
+    struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)exec_node->ops_priv;
     for (int i = 0; i < minimum_op_param->in_num; i++)
     {
         struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]);
@@ -101,17 +100,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         minimum_op_param->input_data[i] = data;
     }
 
-    const void** input = ( const void** )minimum_op_param->input_data;
+    const void** input = (const void**)minimum_op_param->input_data;
     float* output = (float*)output_tensor->data;
 
-    ref_minimum_fp32(( const float** )input, output, elem_num, minimum_op_param);
+    ref_minimum_fp32((const float**)input, output, elem_num, minimum_op_param);
 
     return 0;
 }
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct minimum_op_param* minimum_op_param = ( struct minimum_op_param* )exec_node->ops_priv;
+    struct minimum_op_param* minimum_op_param = (struct minimum_op_param*)exec_node->ops_priv;
 
     sys_free(minimum_op_param->input_data);
 
diff --git a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c
index 338c4e2ce..8e3581c24 100644
--- a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c
+++ b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     exec_node->inplace_map[0] = 0;
@@ -64,8 +63,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    float* idata = ( float* )input_tensor->data;
-    float* odata = ( float* )output_tensor->data;
+    float* idata = (float*)input_tensor->data;
+    float* odata = (float*)output_tensor->data;
     if (idata != odata)
     {
         TLOG_ERR("input and output are not the same mem\n");
diff --git a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c
index b50761810..f52317060 100644
--- a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c
+++ b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.c
@@ -30,10 +30,9 @@
 
 #include <arm_neon.h>
 
-
 static void mish_kernel(int i, int id, void* data, const float* input, float* output)
 {
-    int step = (( int* )data)[0];
+    int step = ((int*)data)[0];
     const float* cur_input = input + id * step;
     float* cur_output = output + id * step;
     for (int i = 0; i < (step & -4); i += 4)
@@ -53,8 +52,8 @@ static void mish_kernel(int i, int id, void* data, const float* input, float* ou
 
 int mish_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread)
 {
-    float* data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    float* data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
 
     int chan_num = (input_tensor->dims[0]) * (input_tensor->dims[1]);
     int chan_size = (input_tensor->dims[2]) * (input_tensor->dims[3]);
diff --git a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h
index 137457a4b..b65a25a1a 100644
--- a/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h
+++ b/source/device/cpu/op/mish/cortex-a/mish_kernel_arm.h
@@ -27,7 +27,6 @@
 
 #include "graph/tensor.h"
 
-
 int mish_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread);
 
 #endif
diff --git a/source/device/cpu/op/mish/cortex-a/mish_math_func.h b/source/device/cpu/op/mish/cortex-a/mish_math_func.h
index 38b80187b..cd21c52c0 100644
--- a/source/device/cpu/op/mish/cortex-a/mish_math_func.h
+++ b/source/device/cpu/op/mish/cortex-a/mish_math_func.h
@@ -30,7 +30,6 @@ refer to ncnn
 
 #include <arm_neon.h>
 
-
 static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
 {
 #if __aarch64__
diff --git a/source/device/cpu/op/mish/mish_kernel_ref.h b/source/device/cpu/op/mish/mish_kernel_ref.h
index ea10ff7d3..33ae84056 100644
--- a/source/device/cpu/op/mish/mish_kernel_ref.h
+++ b/source/device/cpu/op/mish/mish_kernel_ref.h
@@ -25,14 +25,12 @@
 #ifndef __MISH_KERNEL_REF_H__
 #define __MISH_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
 
+int ref_mish_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread);
 
-int ref_mish_fp32(struct tensor *input_tensor, struct tensor *output_tensor, int num_thread);
-
-int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, int num_thread);
+int ref_mish_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread);
 
 #endif
diff --git a/source/device/cpu/op/mish/mish_kernel_ref_fp32.c b/source/device/cpu/op/mish/mish_kernel_ref_fp32.c
index c5431f2ee..2c0d7d9e0 100644
--- a/source/device/cpu/op/mish/mish_kernel_ref_fp32.c
+++ b/source/device/cpu/op/mish/mish_kernel_ref_fp32.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_mish_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     int w = input_tensor->dims[3];
diff --git a/source/device/cpu/op/mish/mish_kernel_ref_uint8.c b/source/device/cpu/op/mish/mish_kernel_ref_uint8.c
index 8f0a2b5e0..2f22f9f27 100644
--- a/source/device/cpu/op/mish/mish_kernel_ref_uint8.c
+++ b/source/device/cpu/op/mish/mish_kernel_ref_uint8.c
@@ -38,8 +38,7 @@
 
 #include <math.h>
 
-
-int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, int num_thread)
+int ref_mish_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     int w = input_tensor->dims[3];
     int h = output_tensor->dims[2];
@@ -61,9 +60,8 @@ int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, in
 
     float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float));
 
-    for(int i = 0; i < total_size; i++)
-        data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale;
-
+    for (int i = 0; i < total_size; i++)
+        data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
 
     for (int n = 0; n < batch; n++)
     {
@@ -81,7 +79,7 @@ int ref_mish_uint8(struct tensor *input_tensor, struct tensor *output_tensor, in
     }
 
     // quant
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         int udata = round(data_fp32[i] / output_scale + output_zero);
         if (udata > 255)
diff --git a/source/device/cpu/op/mish/mish_ref.c b/source/device/cpu/op/mish/mish_ref.c
index 7c7f2addd..91af5a417 100644
--- a/source/device/cpu/op/mish/mish_ref.c
+++ b/source/device/cpu/op/mish/mish_ref.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -57,12 +56,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
     int ret = -1;
-    if(input_tensor->data_type == TENGINE_DT_FP32)
+    if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_mish_fp32(input_tensor, output_tensor, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_mish_uint8(input_tensor, output_tensor, exec_graph->num_thread);
     else
-        TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);        
+        TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
 
     return ret;
 }
diff --git a/source/device/cpu/op/mvn/mvn_ref.c b/source/device/cpu/op/mvn/mvn_ref.c
index 4274dc490..306082d61 100644
--- a/source/device/cpu/op/mvn/mvn_ref.c
+++ b/source/device/cpu/op/mvn/mvn_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 typedef struct _ref_mvn_param
 {
     int input_n;
@@ -69,7 +68,7 @@ int ref_mvn_fp32(float* in_data, float* out_data, p_ref_mvn_param param)
     int normalize_variance = param->normalize_variance;
     float eps = param->eps;
 
-    float* sum = ( float* )malloc(in_c * sizeof(float));
+    float* sum = (float*)malloc(in_c * sizeof(float));
 
     if (NULL == sum)
         return -100;
@@ -130,7 +129,7 @@ int ref_mvn_fp32(float* in_data, float* out_data, p_ref_mvn_param param)
 
         if (normalize_variance)
         {
-            float* sqsum = ( float* )malloc(in_c * sizeof(float));
+            float* sqsum = (float*)malloc(in_c * sizeof(float));
             if (NULL == sqsum)
                 return -100;
 
@@ -227,7 +226,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     op_param.input_h = input_tensor->dims[2];
     op_param.input_w = input_tensor->dims[3];
 
-    struct mvn_param* param = ( struct mvn_param* )node->op.param_mem;
+    struct mvn_param* param = (struct mvn_param*)node->op.param_mem;
     op_param.normalize_variance = param->normalize_variance;
     op_param.across_channels = param->across_channels;
     op_param.eps = param->eps;
diff --git a/source/device/cpu/op/noop/noop_ref.c b/source/device/cpu/op/noop/noop_ref.c
index 62385d18e..67722f5bb 100644
--- a/source/device/cpu/op/noop/noop_ref.c
+++ b/source/device/cpu/op/noop/noop_ref.c
@@ -35,7 +35,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     exec_node->inplace_map[0] = 0;
@@ -71,23 +70,26 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     switch (input_tensor->data_type)
     {
-        case TENGINE_DT_FP32:
-        case TENGINE_DT_INT32: {
-            size *= 4;
-            break;
-        }
-        case TENGINE_DT_FP16:
-        case TENGINE_DT_INT16: {
-            size *= 2;
-            break;
-        }
-        case TENGINE_DT_UINT8:
-        case TENGINE_DT_INT8: {
-            size *= 1;
-            break;
-        }
-        default:
-            return -1;
+    case TENGINE_DT_FP32:
+    case TENGINE_DT_INT32:
+    {
+        size *= 4;
+        break;
+    }
+    case TENGINE_DT_FP16:
+    case TENGINE_DT_INT16:
+    {
+        size *= 2;
+        break;
+    }
+    case TENGINE_DT_UINT8:
+    case TENGINE_DT_INT8:
+    {
+        size *= 1;
+        break;
+    }
+    default:
+        return -1;
     }
 
     if (size <= 0)
diff --git a/source/device/cpu/op/normalize/normalize_ref.c b/source/device/cpu/op/normalize/normalize_ref.c
index 52887a0b0..92990f780 100644
--- a/source/device/cpu/op/normalize/normalize_ref.c
+++ b/source/device/cpu/op/normalize/normalize_ref.c
@@ -37,12 +37,11 @@
 #include <math.h>
 #include <string.h>
 
-
 static void norm_channel(float* input, float* output, float* buffer, float* scale, int hw, int channel, int num_thread)
 {
     memset(buffer, 0, hw * sizeof(float));
 
-//#pragma omp parallel for num_threads(num_thread)
+    //#pragma omp parallel for num_threads(num_thread)
     for (int i = 0; i < channel; i++)
     {
         for (int j = 0; j < hw; j++)
@@ -52,13 +51,13 @@ static void norm_channel(float* input, float* output, float* buffer, float* scal
         }
     }
 
-//#pragma omp parallel for num_threads(num_thread)
+    //#pragma omp parallel for num_threads(num_thread)
     for (int j = 0; j < hw; j++)
     {
         buffer[j] = 1.f / sqrt(buffer[j]);
     }
 
-//#pragma omp parallel for num_threads(num_thread)
+    //#pragma omp parallel for num_threads(num_thread)
     for (int i = 0; i < channel; i++)
     {
         for (int j = 0; j < hw; j++)
@@ -86,17 +85,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
     struct tensor* scale_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
-    normalize_param_t* param = ( normalize_param_t* )(ir_node->op.param_mem);
-    float* input_org = ( float* )input_tensor->data;
-    float* output_org = ( float* )output_tensor->data;
-    float* sclae_org = ( float* )scale_tensor->data;
+    normalize_param_t* param = (normalize_param_t*)(ir_node->op.param_mem);
+    float* input_org = (float*)input_tensor->data;
+    float* output_org = (float*)output_tensor->data;
+    float* sclae_org = (float*)scale_tensor->data;
 
     int batch_number = input_tensor->dims[0];
     int channel_num = input_tensor->dims[1];
     int channel_size = (input_tensor->dims[2]) * (input_tensor->dims[3]);
     int img_size = channel_num * channel_size;
 
-    float* buffer = ( float* )sys_malloc(channel_size * sizeof(float));
+    float* buffer = (float*)sys_malloc(channel_size * sizeof(float));
     if (param->channel_shared == 0 && param->across_spatial == 0)
     {
         for (int i = 0; i < batch_number; i++)
diff --git a/source/device/cpu/op/pad/pad_ref.c b/source/device/cpu/op/pad/pad_ref.c
index ba1f48f7e..d17024b3a 100644
--- a/source/device/cpu/op/pad/pad_ref.c
+++ b/source/device/cpu/op/pad/pad_ref.c
@@ -37,7 +37,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -168,7 +167,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct pad_param* param = ( struct pad_param* )ir_node->op.param_mem;
+    struct pad_param* param = (struct pad_param*)ir_node->op.param_mem;
 
     int batch = input_tensor->dims[0];
     int channel = input_tensor->dims[1];
@@ -195,14 +194,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
             {
                 if (input_tensor->data_type == TENGINE_DT_FP32)
                 {
-                    float* input_data = ( float* )input_tensor->data + n * in_size + c * in_cstep;
-                    float* output_data = ( float* )output_tensor->data + n * out_size + c * out_cstep;
+                    float* input_data = (float*)input_tensor->data + n * in_size + c * in_cstep;
+                    float* output_data = (float*)output_tensor->data + n * out_size + c * out_cstep;
                     ref_pad_fp32(input_data, output_data, in_h, in_w, out_h, out_w, pad_top, pad_left, param->value);
                 }
-                else if(input_tensor->data_type == TENGINE_DT_UINT8)
+                else if (input_tensor->data_type == TENGINE_DT_UINT8)
                 {
-                    uint8_t* input_data = ( uint8_t* )input_tensor->data + n * in_size + c * in_cstep;
-                    uint8_t* output_data = ( uint8_t* )output_tensor->data + n * out_size + c * out_cstep;
+                    uint8_t* input_data = (uint8_t*)input_tensor->data + n * in_size + c * in_cstep;
+                    uint8_t* output_data = (uint8_t*)output_tensor->data + n * out_size + c * out_cstep;
                     ref_pad_uint8(input_data, output_data, in_h, in_w, out_h, out_w, pad_top, pad_left, param->value);
                 }
             }
@@ -223,13 +222,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops pad_node_ops = {.prerun = NULL,
-        .run = run,
-        .reshape = NULL,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score
-};
+                                       .run = run,
+                                       .reshape = NULL,
+                                       .postrun = NULL,
+                                       .init_node = init_node,
+                                       .release_node = release_node,
+                                       .score = score};
 
 int register_pad_ref_op()
 {
diff --git a/source/device/cpu/op/permute/permute_ref.c b/source/device/cpu/op/permute/permute_ref.c
index ce8869641..6e705ab31 100644
--- a/source/device/cpu/op/permute/permute_ref.c
+++ b/source/device/cpu/op/permute/permute_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static void __hwc(const float* input, float* output, int hh, int ww, int cc, int wc, int hw)
 {
     for (int h = 0; h < hh; ++h)
@@ -58,12 +57,12 @@ static void __chw(const float* input, float* output, int hh, int ww, int cc, int
 {
     for (int c = 0; c < cc; ++c)
     {
-        float* output_ptr = output + c * hw;    // chw
+        float* output_ptr = output + c * hw; // chw
         for (int h = 0; h < hh; ++h)
         {
             for (int w = 0; w < ww; ++w)
             {
-                const float* input_ptr = input + h * wc + w * cc;    // input hwc + wc
+                const float* input_ptr = input + h * wc + w * cc; // input hwc + wc
                 // hw + w = input_ptr[c]
                 output_ptr[h * ww + w] = input_ptr[c];
             }
@@ -92,12 +91,12 @@ static void __chw_u8(const uint8_t* input, uint8_t* output, int hh, int ww, int
 {
     for (int c = 0; c < cc; ++c)
     {
-        uint8_t* output_ptr = output + c * hw;    // chw
+        uint8_t* output_ptr = output + c * hw; // chw
         for (int h = 0; h < hh; ++h)
         {
             for (int w = 0; w < ww; ++w)
             {
-                const uint8_t* input_ptr = input + h * wc + w * cc;    // input hwc + wc
+                const uint8_t* input_ptr = input + h * wc + w * cc; // input hwc + wc
                 // hw + w = input_ptr[c]
                 output_ptr[h * ww + w] = input_ptr[c];
             }
@@ -126,12 +125,12 @@ static void __chw_i8(const int8_t* input, int8_t* output, int hh, int ww, int cc
 {
     for (int c = 0; c < cc; ++c)
     {
-        int8_t* output_ptr = output + c * hw;    // chw
+        int8_t* output_ptr = output + c * hw; // chw
         for (int h = 0; h < hh; ++h)
         {
             for (int w = 0; w < ww; ++w)
             {
-                const int8_t* input_ptr = input + h * wc + w * cc;    // input hwc + wc
+                const int8_t* input_ptr = input + h * wc + w * cc; // input hwc + wc
                 // hw + w = input_ptr[c]
                 output_ptr[h * ww + w] = input_ptr[c];
             }
@@ -401,7 +400,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    permute_param_t* param = ( struct permute_param* )(ir_node->op.param_mem);
+    permute_param_t* param = (struct permute_param*)(ir_node->op.param_mem);
 
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
diff --git a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c
index 6cb4e3781..4b6d3fe7a 100644
--- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c
+++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c
@@ -48,7 +48,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* input_tensor;
     struct tensor* output_tensor;
 
-    struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem;
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
@@ -65,7 +65,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor;
     struct tensor* output_tensor;
 
-    struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem;
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
@@ -92,7 +92,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
-    struct pool_param* pool_param = ( struct pool_param* )exec_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)exec_node->op.param_mem;
 
     int global = pool_param->global;
     int type = pool_param->pool_method;
@@ -104,7 +104,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     int pad_h1 = pool_param->pad_h1;
     int pad_w0 = pool_param->pad_w0;
     int pad_w1 = pool_param->pad_w1;
-    int pad_tf = pool_param->pad_h0_org;    // maybe there is a bug.
+    int pad_tf = pool_param->pad_h0_org; // maybe there is a bug.
 
     int pool_size = 0;
 
@@ -136,8 +136,8 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
         /* general max pooling, k2s2, k2k2p1, k3s1p1, k3s2, k3s2p1 */
         if (type == POOL_MAX && (pad_h0 == pad_w0) && (pad_h1 == pad_w1) && pad_tf != -1)
         {
-			if (pad_h0 == 0 && (pool_size == POOL_K2S2))
-				return 0;
+            if (pad_h0 == 0 && (pool_size == POOL_K2S2))
+                return 0;
             if (pad_h0 == 0 && (pool_size == POOL_K3S2))
                 return OPS_SCORE_BEST;
             if (pad_h0 == 1 && (pool_size == POOL_K2S2 || pool_size == POOL_K3S2 || pool_size == POOL_K3S1))
@@ -151,7 +151,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
                 return OPS_SCORE_BEST;
             if (pad_h0 == 1 && pad_h1 == 1 && (pool_size == POOL_K2S2 || pool_size == POOL_K3S2 || pool_size == POOL_K3S1))
                 return OPS_SCORE_BEST;
-            else if(pad_h0 == 0 && pad_h1 == 1 && (pool_size == POOL_K3S2))
+            else if (pad_h0 == 0 && pad_h1 == 1 && (pool_size == POOL_K3S2))
                 return OPS_SCORE_BEST;
         }
     }
@@ -159,7 +159,6 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
     return 0;
 }
 
-
 static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .run = run,
                                        .reshape = NULL,
@@ -168,13 +167,11 @@ static struct node_ops hcl_node_ops = {.prerun = prerun,
                                        .release_node = release_node,
                                        .score = score};
 
-
 int register_pooling_hcl_arm_op()
 {
     return register_builtin_node_ops(OP_POOL, &hcl_node_ops);
 }
 
-
 int unregister_pooling_hcl_arm_op()
 {
     return unregister_builtin_node_ops(OP_POOL, &hcl_node_ops);
diff --git a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h
index ddc2bbedc..062e66015 100644
--- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h
+++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.h
@@ -43,10 +43,9 @@
 #include <arm_neon.h>
 
 #define POOL_GENERIC 0
-#define POOL_K2S2 1
-#define POOL_K3S2 2
-#define POOL_K3S1 3
-
+#define POOL_K2S2    1
+#define POOL_K3S2    2
+#define POOL_K3S1    3
 
 typedef void (*pooling_kernel_t)(const void* input, void* output, int inc, int inh, int inw, int outh, int outw, int,
                                  int, int, int, int, int, int pad_h1, int pad_w1, int);
@@ -318,9 +317,7 @@ static void avg_3x3s2(const float* input, float* output, int inc, int inh, int i
             }
             for (int j = block_w * 4; j < outw; j++)
             {
-                *out_ptr =
-                    (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) *
-                    0.11111111f;
+                *out_ptr = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * 0.11111111f;
                 out_ptr++;
                 line0 += 2;
                 line1 += 2;
@@ -1242,9 +1239,7 @@ static void avg_3x3s2_p1(const float* input, float* output, int inc, int inh, in
             }
             for (int j = block_w * 4 + 1; j < outw; j++)
             {
-                *out_ptr =
-                    (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) *
-                    0.11111111f;
+                *out_ptr = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * 0.11111111f;
                 out_ptr++;
                 line0 += 2;
                 line1 += 2;
@@ -1516,9 +1511,7 @@ static void avg_3x3s1_p1(const float* input, float* output, int inc, int inh, in
             // mid
             for (int j = 0; j < mid_w; j++)
             {
-                *out_ptr =
-                    (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) *
-                    0.11111111f;
+                *out_ptr = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]) * 0.11111111f;
                 out_ptr++;
                 line0 += 1;
                 line1 += 1;
@@ -1629,9 +1622,9 @@ int pooling_kernel_perf_prerun(struct tensor* input, struct tensor* out, struct
     if (param->global)
     {
         if (param->pool_method == POOL_AVG)
-            param->funct = ( pooling_kernel_t )avg_global;
+            param->funct = (pooling_kernel_t)avg_global;
         else if (param->pool_method == POOL_MAX)
-            param->funct = ( pooling_kernel_t )max_global;
+            param->funct = (pooling_kernel_t)max_global;
 
         assert(param->funct != NULL);
         return 0;
@@ -1659,18 +1652,18 @@ int pooling_kernel_perf_prerun(struct tensor* input, struct tensor* out, struct
             if (param->pad_h0 == 0)
             {
                 if (pool_size == POOL_K2S2)
-                    param->funct = ( pooling_kernel_t )max_2x2s2;
+                    param->funct = (pooling_kernel_t)max_2x2s2;
                 else if (pool_size == POOL_K3S2)
-                    param->funct = ( pooling_kernel_t )max_3x3s2;
+                    param->funct = (pooling_kernel_t)max_3x3s2;
             }
             else if (param->pad_h0 == 1)
             {
                 if (pool_size == POOL_K2S2)
-                    param->funct = ( pooling_kernel_t )max_2x2s2_p1;
+                    param->funct = (pooling_kernel_t)max_2x2s2_p1;
                 else if (pool_size == POOL_K3S2)
-                    param->funct = ( pooling_kernel_t )max_3x3s2_p1;
+                    param->funct = (pooling_kernel_t)max_3x3s2_p1;
                 else if (pool_size == POOL_K3S1)
-                    param->funct = ( pooling_kernel_t )max_3x3s1_p1;
+                    param->funct = (pooling_kernel_t)max_3x3s1_p1;
             }
         }
 
@@ -1691,23 +1684,23 @@ int pooling_kernel_perf_prerun(struct tensor* input, struct tensor* out, struct
             if (param->pad_h0 == 0 && param->pad_h1 == 0)
             {
                 if (pool_size == POOL_K2S2)
-                    param->funct = ( pooling_kernel_t )avg_2x2s2;
+                    param->funct = (pooling_kernel_t)avg_2x2s2;
                 else if (pool_size == POOL_K3S2)
-                    param->funct = ( pooling_kernel_t )avg_3x3s2;
+                    param->funct = (pooling_kernel_t)avg_3x3s2;
             }
             else if (param->pad_h0 == 1 && param->pad_h1 == 1)
             {
                 if (pool_size == POOL_K2S2)
-                    param->funct = ( pooling_kernel_t )avg_2x2s2_p1;
+                    param->funct = (pooling_kernel_t)avg_2x2s2_p1;
                 else if (pool_size == POOL_K3S2)
-                    param->funct = ( pooling_kernel_t )avg_3x3s2_p1;
+                    param->funct = (pooling_kernel_t)avg_3x3s2_p1;
                 else if (pool_size == POOL_K3S1)
-                    param->funct = ( pooling_kernel_t )avg_3x3s1_p1;
+                    param->funct = (pooling_kernel_t)avg_3x3s1_p1;
             }
             else if (param->pad_h0 == 0 && param->pad_h1 == 1)
             {
                 if (pool_size == POOL_K3S2)
-                    param->funct = ( pooling_kernel_t ) avg_3x3s2;
+                    param->funct = (pooling_kernel_t)avg_3x3s2;
             }
         }
 
diff --git a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h
index bd5789084..e87487781 100644
--- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h
+++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm_int8.h
@@ -43,9 +43,9 @@
 #include <arm_neon.h>
 
 #define POOL_GENERIC 0
-#define POOL_K2S2 1
-#define POOL_K3S2 2
-#define POOL_K3S1 3
+#define POOL_K2S2    1
+#define POOL_K3S2    2
+#define POOL_K3S1    3
 
 static inline int8_t arm_max_int8(int8_t a, int8_t b)
 {
@@ -64,7 +64,7 @@ static inline int8_t arm_min_int8(int8_t a, int8_t b)
 }
 
 typedef void (*pooling_kernel_int8_t)(const void* input, void* output, int inc, int inh, int inw, int outh, int outw, int k_h,
-                                 int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale);
+                                      int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale);
 
 static void pad_0_align_2D_int8(int8_t* dst, int8_t* src, int m, int n, int m_align, int n_align, int pad_h, int pad_w)
 {
@@ -125,16 +125,16 @@ static void delete_0_3D_int8(int8_t* dst, int8_t* src, int m_align, int n_align,
 }
 
 static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h,
-                      int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale)
+                           int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale)
 {
     int in_hw = inw * inh;
     int out_hw = outh * outw;
 
-    if(pad_w1 > 0)
+    if (pad_w1 > 0)
     {
         outw--;
     }
-    if(pad_h1 > 0)
+    if (pad_h1 > 0)
     {
         outh--;
     }
@@ -142,15 +142,15 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
     int remain_w = inw - outw * 2;
     int index = 0;
 
-    for(int c = 0; c < inc; c++)
+    for (int c = 0; c < inc; c++)
     {
         index = 0;
         const int8_t* line0 = input + c * in_hw;
         const int8_t* line1 = line0 + inw;
         int8_t* out_ptr = output + c * out_hw;
-        for(int i = 0; i < outh; i++)
+        for (int i = 0; i < outh; i++)
         {
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 int8x8_t p10 = vld1_s8(line1);
@@ -162,18 +162,18 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 #ifdef __aarch64__
                 /* pairwaise max */
                 sum0 = vpaddq_s16(sum0, sum1);
-                for(int n = 0; n < 8; n++)
+                for (int n = 0; n < 8; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(sum0[n] / 4);
+                    out_ptr[n] = (int8_t)round(sum0[n] / 4);
                 }
 #else
                 /* pairwaise max */
                 int32x4_t suml0 = vpaddlq_s16(sum0);
                 int32x4_t suml1 = vpaddlq_s16(sum1);
-                for(int n = 0; n < 4; n++)
+                for (int n = 0; n < 4; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(suml0[n] / 4);
-                    out_ptr[n + 1] = ( int8_t )round(suml1[n] / 4);
+                    out_ptr[n] = (int8_t)round(suml0[n] / 4);
+                    out_ptr[n + 1] = (int8_t)round(suml1[n] / 4);
                 }
 #endif
                 line0 += 16;
@@ -181,7 +181,7 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 index = index + 8;
             }
             index = block_w * 8;
-            if(outw - index >= 4)
+            if (outw - index >= 4)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 int8x8_t p10 = vld1_s8(line1);
@@ -190,42 +190,42 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 /* pairwaise max */
                 int16x8_t sum1 = {0};
                 sum0 = vpaddq_s16(sum0, sum1);
-                for(int n = 0; n < 4; n++)
+                for (int n = 0; n < 4; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(sum0[n] / 4);
+                    out_ptr[n] = (int8_t)round(sum0[n] / 4);
                 }
 #else
                 /* pairwaise max */
                 int32x4_t suml0 = vpaddlq_s16(sum0);
-                for(int n = 0; n < 4; n++)
+                for (int n = 0; n < 4; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(suml0[n] / 4);
+                    out_ptr[n] = (int8_t)round(suml0[n] / 4);
                 }
 #endif
                 line0 += 8;
                 out_ptr = out_ptr + 4;
                 index = index + 4;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
-                *out_ptr = ( int8_t )round((line0[0] + line0[1] + line1[0] + line1[1]) / 4);
+                *out_ptr = (int8_t)round((line0[0] + line0[1] + line1[0] + line1[1]) / 4);
                 out_ptr++;
                 line0 += 2;
                 line1 += 2;
             }
-            if(pad_w1 > 0)
+            if (pad_w1 > 0)
             {
-                *out_ptr = ( int8_t )round((line0[0] + line1[0]) / 2);
+                *out_ptr = (int8_t)round((line0[0] + line1[0]) / 2);
                 out_ptr++;
             }
 
             line0 += remain_w + inw;
             line1 += remain_w + inw;
         }
-        if(pad_h1)
+        if (pad_h1)
         {
             index = 0;
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 int8x8_t p01 = vld1_s8(line0 + 8);
@@ -237,17 +237,17 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 int16x8_t sum1 = vaddl_s8(p01, p02);
 #ifdef __aarch64__
                 sum0 = vpaddq_s16(sum0, sum1);
-                for(int n = 0; n < 8; n++)
+                for (int n = 0; n < 8; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(sum0[n] / 4);
+                    out_ptr[n] = (int8_t)round(sum0[n] / 4);
                 }
 #else
                 int32x4_t suml0 = vpaddlq_s16(sum0);
                 int32x4_t suml1 = vpaddlq_s16(sum1);
-                for(int n = 0; n < 4; n++)
+                for (int n = 0; n < 4; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(suml0[n] / 4);
-                    out_ptr[n + 1] = ( int8_t )round(suml1[n] / 4);
+                    out_ptr[n] = (int8_t)round(suml0[n] / 4);
+                    out_ptr[n + 1] = (int8_t)round(suml1[n] / 4);
                 }
 #endif
                 line0 += 16;
@@ -255,7 +255,7 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 index = index + 8;
             }
             index = block_w * 8;
-            if(outw - index >= 4)
+            if (outw - index >= 4)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 int8x8_t p01 = {0};
@@ -264,31 +264,31 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 /* pairwaise max */
                 int16x8_t sum1 = {0};
                 sum0 = vpaddq_s16(sum0, sum1);
-                for(int n = 0; n < 4; n++)
+                for (int n = 0; n < 4; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(sum0[n] / 4);
+                    out_ptr[n] = (int8_t)round(sum0[n] / 4);
                 }
 #else
                 /* pairwaise max */
                 int32x4_t suml0 = vpaddlq_s16(sum0);
-                for(int n = 0; n < 4; n++)
+                for (int n = 0; n < 4; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(suml0[n] / 4);
+                    out_ptr[n] = (int8_t)round(suml0[n] / 4);
                 }
 #endif
                 line0 += 8;
                 out_ptr = out_ptr + 4;
                 index = index + 4;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
                 int sum0 = line0[0] + line0[1];
-                *out_ptr = ( int8_t )round((sum0) / 2);
+                *out_ptr = (int8_t)round((sum0) / 2);
                 out_ptr++;
                 line0 += 2;
                 line1 += 2;
             }
-            if(pad_w1 > 0)
+            if (pad_w1 > 0)
             {
                 *out_ptr = line0[0];
                 out_ptr++;
@@ -298,16 +298,16 @@ static void avg_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 }
 
 static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h,
-                      int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale)
+                           int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale)
 {
     int in_hw = inw * inh;
     int out_hw = outh * outw;
 
-    if(pad_w1 > 0)
+    if (pad_w1 > 0)
     {
         outw--;
     }
-    if(pad_h1 > 0)
+    if (pad_h1 > 0)
     {
         outh--;
     }
@@ -318,14 +318,14 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 #endif
     int remain_w = inw - outw * 2;
     int index = 0;
-    for(int c = 0; c < inc; c++)
+    for (int c = 0; c < inc; c++)
     {
         const int8_t* line0 = input + c * in_hw;
         const int8_t* line1 = line0 + inw;
         int8_t* out_ptr = output + c * out_hw;
-        for(int i = 0; i < outh; i++)
+        for (int i = 0; i < outh; i++)
         {
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
 #ifdef __aarch64__
                 int8x16_t p00 = vld1q_s8(line0);
@@ -362,7 +362,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
             }
             index = block_w * 8;
 #endif
-            if(outw - index >= 8)
+            if (outw - index >= 8)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 int8x8_t p10 = vld1_s8(line1);
@@ -380,7 +380,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr = out_ptr + 8;
                 index = index + 8;
             }
-            if(outw - index >= 4)
+            if (outw - index >= 4)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 int8x8_t p10 = vld1_s8(line1);
@@ -399,7 +399,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr = out_ptr + 4;
                 index = index + 4;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
                 int8_t max0 = arm_max_int8(line0[0], line0[1]);
                 int8_t max1 = arm_max_int8(line1[0], line1[1]);
@@ -409,7 +409,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 line0 += 2;
                 line1 += 2;
             }
-            if(pad_w1 > 0)
+            if (pad_w1 > 0)
             {
                 *out_ptr = arm_max_int8(line0[0], line1[0]);
                 out_ptr++;
@@ -417,9 +417,9 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
             line0 += remain_w + inw;
             line1 += remain_w + inw;
         }
-        if(pad_h1 > 0)
+        if (pad_h1 > 0)
         {
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
 #ifdef __aarch64__
                 int8x16_t p00 = vld1q_s8(line0);
@@ -444,7 +444,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
             }
             index = block_w * 8;
 #endif
-            if(outw - index >= 8)
+            if (outw - index >= 8)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 int8x8_t p01 = vld1_s8(line0 + 8);
@@ -456,7 +456,7 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr = out_ptr + 8;
                 index = index + 8;
             }
-            if(outw - index >= 4)
+            if (outw - index >= 4)
             {
                 int8x8_t p00 = vld1_s8(line0);
                 /* pairwaise max */
@@ -472,13 +472,13 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr = out_ptr + 4;
                 index = index + 4;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
                 *out_ptr = arm_max_int8(line0[0], line0[1]);
                 out_ptr++;
                 line0 += 2;
             }
-            if(pad_w1 > 0)
+            if (pad_w1 > 0)
             {
                 *out_ptr = arm_max_int8(line0[0], line1[0]);
                 out_ptr++;
@@ -488,32 +488,32 @@ static void max_2x2s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 }
 
 static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h,
-                      int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale)
+                           int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale)
 {
     int in_hw = inw * inh;
     int out_hw = outh * outw;
 
-    if(pad_w1 > 0)
+    if (pad_w1 > 0)
     {
         outw--;
     }
-    if(pad_h1 > 0)
+    if (pad_h1 > 0)
     {
         outh--;
     }
     int block_w = outw >> 3;
     int remain_w = inw - outw * 2;
     int index = 0;
-    for(int c = 0; c < inc; c++)
+    for (int c = 0; c < inc; c++)
     {
         const int8_t* line0 = input + c * in_hw;
         const int8_t* line1 = line0 + inw;
         const int8_t* line2 = line1 + inw;
         int8_t* out_ptr = output + c * out_hw;
-        for(int i = 0; i < outh; i++)
+        for (int i = 0; i < outh; i++)
         {
             index = 0;
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
                 int8x8x2_t p00 = vld2_s8(line0);
                 int8x8x2_t p10 = vld2_s8(line1);
@@ -538,9 +538,9 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 
                 // sum0 = vadd_s8(vadd_s8(sum0, sum1), sum2);
 
-                for(int n = 0; n < 8; n++)
+                for (int n = 0; n < 8; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(sum0[n] / 9);
+                    out_ptr[n] = (int8_t)round(sum0[n] / 9);
                 }
 
                 p00 = p00_new;
@@ -555,36 +555,35 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 index = index + 8;
             }
 
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
-                int sum =
-                    (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]);
-                *out_ptr = ( int8_t )round(sum / 9);
+                int sum = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2] + line2[0] + line2[1] + line2[2]);
+                *out_ptr = (int8_t)round(sum / 9);
                 out_ptr++;
                 line0 += 2;
                 line1 += 2;
                 line2 += 2;
             }
-            if(pad_w1 == 1)
+            if (pad_w1 == 1)
             {
                 int sum = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2]);
-                *out_ptr = ( int8_t )round(sum / 6);
+                *out_ptr = (int8_t)round(sum / 6);
                 out_ptr++;
             }
-            else if(pad_w1 == 2)
+            else if (pad_w1 == 2)
             {
                 int sum = (line0[0] + line1[0] + line2[0]);
-                *out_ptr = ( int8_t )round(sum / 6);
+                *out_ptr = (int8_t)round(sum / 6);
                 out_ptr++;
             }
             line0 += remain_w + inw;
             line1 += remain_w + inw;
             line2 += remain_w + inw;
         }
-        if(pad_h1 == 1)
+        if (pad_h1 == 1)
         {
             index = 0;
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
                 int8x8x2_t p00 = vld2_s8(line0);
                 int8x8x2_t p10 = vld2_s8(line1);
@@ -600,9 +599,9 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 int8x8_t p11 = vext_s8(p10.val[0], p10_new.val[0], 1);
                 sum0 = vaddw_s8(sum0, p11);
 
-                for(int n = 0; n < 8; n++)
+                for (int n = 0; n < 8; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(sum0[n] / 6);
+                    out_ptr[n] = (int8_t)round(sum0[n] / 6);
                 }
 
                 p00 = p00_new;
@@ -612,31 +611,31 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr += 8;
                 index = index + 8;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
                 int sum = (line0[0] + line0[1] + line0[2] + line1[0] + line1[1] + line1[2]);
-                *out_ptr = ( int8_t )round(sum / 6);
+                *out_ptr = (int8_t)round(sum / 6);
                 out_ptr++;
                 line0 += 2;
                 line1 += 2;
             }
-            if(pad_w1 == 1)
+            if (pad_w1 == 1)
             {
                 int sum = (line0[0] + line0[1] + line1[0] + line1[1]);
-                *out_ptr = ( int8_t )round(sum / 4);
+                *out_ptr = (int8_t)round(sum / 4);
                 out_ptr++;
             }
-            else if(pad_w1 == 2)
+            else if (pad_w1 == 2)
             {
                 int sum = (line0[0] + line1[0]);
-                *out_ptr = ( int8_t )round(sum / 2);
+                *out_ptr = (int8_t)round(sum / 2);
                 out_ptr++;
             }
         }
-        else if(pad_h1 == 2)
+        else if (pad_h1 == 2)
         {
             index = 0;
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
                 int8x8x2_t p00 = vld2_s8(line0);
                 int8x8x2_t p00_new = vld2_s8(line0 + 16);
@@ -644,9 +643,9 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 int8x8_t p01 = vext_s8(p00.val[0], p00_new.val[0], 1);
                 sum0 = vaddw_s8(sum0, p01);
 
-                for(int n = 0; n < 8; n++)
+                for (int n = 0; n < 8; n++)
                 {
-                    out_ptr[n] = ( int8_t )round(sum0[n] / 3);
+                    out_ptr[n] = (int8_t)round(sum0[n] / 3);
                 }
 
                 p00 = p00_new;
@@ -654,18 +653,18 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr += 8;
                 index = index + 8;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
-                *out_ptr = ( int8_t )round((line0[0] + line0[1] + line0[2]) / 3);
+                *out_ptr = (int8_t)round((line0[0] + line0[1] + line0[2]) / 3);
                 out_ptr++;
                 line0 += 2;
             }
-            if(pad_w1 == 1)
+            if (pad_w1 == 1)
             {
-                *out_ptr = ( int8_t )round((line0[0] + line0[1]) / 2);
+                *out_ptr = (int8_t)round((line0[0] + line0[1]) / 2);
                 out_ptr++;
             }
-            else if(pad_w1 == 2)
+            else if (pad_w1 == 2)
             {
                 *out_ptr = line0[0];
                 out_ptr++;
@@ -675,16 +674,16 @@ static void avg_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 }
 
 static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h,
-                      int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale)
+                           int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale)
 {
     int in_hw = inw * inh;
     int out_hw = outh * outw;
 
-    if(pad_w1 > 0)
+    if (pad_w1 > 0)
     {
         outw--;
     }
-    if(pad_h1 > 0)
+    if (pad_h1 > 0)
     {
         outh--;
     }
@@ -693,18 +692,18 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 
     int index = 0;
 
-    for(int c = 0; c < inc; c++)
+    for (int c = 0; c < inc; c++)
     {
         const int8_t* line0 = input + c * in_hw;
         const int8_t* line1 = line0 + inw;
         const int8_t* line2 = line1 + inw;
         int8_t* out_ptr = output + c * out_hw;
-        for(int i = 0; i < outh; i++)
+        for (int i = 0; i < outh; i++)
         {
             int8x16x2_t p00 = vld2q_s8(line0);
             int8x16x2_t p10 = vld2q_s8(line1);
             int8x16x2_t p20 = vld2q_s8(line2);
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
                 /*
                 p00     = [1,2,3,4,5,6,7,8...]
@@ -745,7 +744,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 
             index = block_w * 16;
 
-            if(outw - index > 8)
+            if (outw - index > 8)
             {
                 int8x8x2_t p00 = vld2_s8(line0);
                 int8x8x2_t p10 = vld2_s8(line1);
@@ -779,7 +778,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr += 8;
                 index = index + 8;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
                 int8_t max0 = arm_max_int8(arm_max_int8(line0[0], line0[1]), line0[2]);
                 int8_t max1 = arm_max_int8(arm_max_int8(line1[0], line1[1]), line1[2]);
@@ -791,7 +790,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 line1 += 2;
                 line2 += 2;
             }
-            if(pad_w1 == 1)
+            if (pad_w1 == 1)
             {
                 int8_t max0 = arm_max_int8(arm_max_int8(line0[0], line0[1]), arm_max_int8(line1[0], line1[1]));
                 *out_ptr = arm_max_int8(arm_max_int8(line2[0], line2[1]), max0);
@@ -801,11 +800,11 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
             line1 += remain_w + inw;
             line2 += remain_w + inw;
         }
-        if(pad_h1 == 1)
+        if (pad_h1 == 1)
         {
             int8x16x2_t p00 = vld2q_s8(line0);
             int8x16x2_t p10 = vld2q_s8(line1);
-            for(int j = 0; j < block_w; j++)
+            for (int j = 0; j < block_w; j++)
             {
                 int8x16x2_t p00_new = vld2q_s8(line0 + 32);
                 int8x16_t max0 = vmaxq_s8(p00.val[0], p00.val[1]);
@@ -830,7 +829,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 
             index = block_w * 16;
 
-            if(outw - index > 8)
+            if (outw - index > 8)
             {
                 int8x8x2_t p00 = vld2_s8(line0);
                 int8x8x2_t p10 = vld2_s8(line1);
@@ -856,7 +855,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 out_ptr += 8;
                 index = index + 8;
             }
-            for(; index < outw; index++)
+            for (; index < outw; index++)
             {
                 int8_t max0 = arm_max_int8(arm_max_int8(line0[0], line0[1]), line0[2]);
                 int8_t max1 = arm_max_int8(arm_max_int8(line1[0], line1[1]), line1[2]);
@@ -865,7 +864,7 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
                 line0 += 2;
                 line1 += 2;
             }
-            if(pad_w1 == 1)
+            if (pad_w1 == 1)
             {
                 *out_ptr = arm_max_int8(arm_max_int8(line0[0], line0[1]), arm_max_int8(line1[0], line1[1]));
                 out_ptr++;
@@ -875,18 +874,18 @@ static void max_3x3s2_int8(const int8_t* input, int8_t* output, int inc, int inh
 }
 
 static void avg_global_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h,
-                       int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale)
+                            int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale)
 {
     int in_hw = inw * inh;
     int block = in_hw >> 4;
 
-    for(int c = 0; c < inc; c++)
+    for (int c = 0; c < inc; c++)
     {
         int index = 0;
         const int8_t* line0 = input + c * in_hw;
         int8_t* out_ptr = output + c;
         int sum = 0;
-        for(int j = 0; j < block; j++)
+        for (int j = 0; j < block; j++)
         {
             int8x8_t p00 = vld1_s8(line0);
             int8x8_t p01 = vld1_s8(line0 + 8);
@@ -897,30 +896,30 @@ static void avg_global_int8(const int8_t* input, int8_t* output, int inc, int in
         }
         index = block * 16;
 
-        for(int j = index; j < in_hw; j++)
+        for (int j = index; j < in_hw; j++)
         {
             sum += line0[0];
             line0++;
         }
         float sum_fp32 = sum * in_scale;
-        sum_fp32 = sum_fp32/in_hw;
-        int tmp = (int)round(sum_fp32/out_scale);
-        if(tmp > 127)
+        sum_fp32 = sum_fp32 / in_hw;
+        int tmp = (int)round(sum_fp32 / out_scale);
+        if (tmp > 127)
             tmp = 127;
-        else if(tmp < -127)
+        else if (tmp < -127)
             tmp = -127;
 
-        *out_ptr = ( int8_t )tmp;//round(sum / in_hw);
+        *out_ptr = (int8_t)tmp; //round(sum / in_hw);
     }
 }
 
 static void max_global_int8(const int8_t* input, int8_t* output, int inc, int inh, int inw, int outh, int outw, int k_h,
-                       int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale,float out_scale)
+                            int k_w, int s_h, int s_w, int pad_h0, int pad_w0, int pad_h1, int pad_w1, int is_caffe, float in_scale, float out_scale)
 {
     int in_hw = inw * inh;
     int block = in_hw >> 5;
 
-    for(int c = 0; c < inc; c++)
+    for (int c = 0; c < inc; c++)
     {
         int index = 0;
         const int8_t* line0 = input + c * in_hw;
@@ -928,7 +927,7 @@ static void max_global_int8(const int8_t* input, int8_t* output, int inc, int in
 
         int8x16_t p00 = vld1q_s8(line0);
         int8x16_t res = p00;
-        for(int j = 0; j < block; j++)
+        for (int j = 0; j < block; j++)
         {
             int8x16_t p00 = vld1q_s8(line0);
             int8x16_t p01 = vld1q_s8(line0 + 16);
@@ -937,11 +936,11 @@ static void max_global_int8(const int8_t* input, int8_t* output, int inc, int in
             line0 += 32;
         }
         int8_t max_ = 0;
-        if(block > 0)
+        if (block > 0)
         {
             max_ = res[0];
 #ifdef __aarch64__
-            for(int n = 1; n < 16; n++)
+            for (int n = 1; n < 16; n++)
             {
                 max_ = arm_max_int8(max_, res[n]);
             }
@@ -969,7 +968,7 @@ static void max_global_int8(const int8_t* input, int8_t* output, int inc, int in
             max_ = line0[0];
         }
         index = block * 32;
-        for(int j = index; j < in_hw; j++)
+        for (int j = index; j < in_hw; j++)
         {
             max_ = arm_max_int8(max_, line0[0]);
             line0++;
@@ -986,9 +985,9 @@ int pooling_kernel_int8_perf_prerun(struct tensor* input, struct tensor* out, st
     if (param->global)
     {
         if (param->pool_method == POOL_AVG)
-            param->funct = ( pooling_kernel_int8_t )avg_global_int8;
+            param->funct = (pooling_kernel_int8_t)avg_global_int8;
         else if (param->pool_method == POOL_MAX)
-            param->funct = ( pooling_kernel_int8_t )max_global_int8;
+            param->funct = (pooling_kernel_int8_t)max_global_int8;
 
         assert(param->funct != NULL);
         return 0;
@@ -1009,9 +1008,9 @@ int pooling_kernel_int8_perf_prerun(struct tensor* input, struct tensor* out, st
         if ((param->pad_h0 == param->pad_w0) && (param->pad_h1 == param->pad_w1))
         {
             if (pool_size == POOL_K2S2)
-                param->funct = ( pooling_kernel_int8_t )max_2x2s2_int8;
+                param->funct = (pooling_kernel_int8_t)max_2x2s2_int8;
             else if (pool_size == POOL_K3S2)
-                param->funct = ( pooling_kernel_int8_t )max_3x3s2_int8;
+                param->funct = (pooling_kernel_int8_t)max_3x3s2_int8;
         }
     }
     /* general avg pooling, k2s2, k2s2p1, k3s2, k3s2p1 */
@@ -1020,9 +1019,9 @@ int pooling_kernel_int8_perf_prerun(struct tensor* input, struct tensor* out, st
         if ((param->pad_h0 == param->pad_w0) && (param->pad_h1 == param->pad_w1))
         {
             if (pool_size == POOL_K2S2)
-                param->funct = ( pooling_kernel_int8_t )avg_2x2s2_int8;
+                param->funct = (pooling_kernel_int8_t)avg_2x2s2_int8;
             else if (pool_size == POOL_K3S2)
-                param->funct = ( pooling_kernel_int8_t )avg_3x3s2_int8;
+                param->funct = (pooling_kernel_int8_t)avg_3x3s2_int8;
         }
     }
 
@@ -1079,7 +1078,7 @@ int pooling_kernel_int8_perf_run(struct tensor* input, struct tensor* output, st
         if (param->input_pad != NULL)
         {
             pad_0_align_3D_int8((int8_t*)param->input_pad + n * c * in_h_pad * in_w_pad, (int8_t*)input_frame,
-                        in_h_origin, in_w_origin, in_h_pad, in_w_pad, c, pad_h0, pad_w0);
+                                in_h_origin, in_w_origin, in_h_pad, in_w_pad, c, pad_h0, pad_w0);
         }
 
 #pragma omp parallel for num_threads(num_thread)
diff --git a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c
index fa1540cb9..e30c84c7e 100644
--- a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c
+++ b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c
@@ -44,7 +44,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor;
     struct tensor* output_tensor;
 
-    struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem;
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref.h b/source/device/cpu/op/pooling/pooling_kernel_ref.h
index f835433e1..43c471415 100644
--- a/source/device/cpu/op/pooling/pooling_kernel_ref.h
+++ b/source/device/cpu/op/pooling/pooling_kernel_ref.h
@@ -31,17 +31,16 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread);
+                     struct pool_param* pool_param, int num_thread);
 
 int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread);
+                     struct pool_param* pool_param, int num_thread);
 
 int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread);
+                      struct pool_param* pool_param, int num_thread);
 
 int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread);
+                     struct pool_param* pool_param, int num_thread);
 
 #endif
diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c b/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c
index 141bea3ce..31a694e8e 100644
--- a/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c
+++ b/source/device/cpu/op/pooling/pooling_kernel_ref_fp16.c
@@ -41,7 +41,6 @@
 #define HCL_POOL_MAX 0 /* Max pooling     */
 #define HCL_POOL_AVG 1 /* Average pooling */
 
-
 #if MACOS
 
 #else
@@ -49,11 +48,11 @@ static inline void calc_sum_fp16(const fp16_t* input, fp16_t* sum, int layout, i
                                  int start_h, int start_w, int end_h, int end_w)
 {
     float sum_f = 0.0f;
-    for(int i = start_h; i < end_h; i++)
+    for (int i = start_h; i < end_h; i++)
     {
-        for(int j = start_w; j < end_w; j++)
+        for (int j = start_w; j < end_w; j++)
         {
-            if(layout == 0)
+            if (layout == 0)
                 sum_f += fp16_to_fp32(input[cur_ch * h * w + i * w + j]);
             else
                 sum_f += fp16_to_fp32(input[i * w * c + j * c + cur_ch]);
@@ -67,15 +66,15 @@ static inline void calc_max_fp16(const fp16_t* input, fp16_t* max, int layout, i
 {
     float max_f = 0.0f;
     float tmp = 0.0f;
-    if(layout == 0)
+    if (layout == 0)
         max_f = fp16_to_fp32(input[cur_ch * h * w + start_h * w + start_w]);
     else
         max_f = fp16_to_fp32(input[start_h * w * c + start_w * c + cur_ch]);
-    for(int i = start_h; i < end_h; i++)
+    for (int i = start_h; i < end_h; i++)
     {
-        for(int j = start_w; j < end_w; j++)
+        for (int j = start_w; j < end_w; j++)
         {
-            if(layout == 0)
+            if (layout == 0)
                 tmp = fp16_to_fp32(input[cur_ch * h * w + i * w + j]);
             else
                 tmp = fp16_to_fp32(input[i * w * c + j * c + cur_ch]);
@@ -89,7 +88,7 @@ static inline void calc_max_fp16(const fp16_t* input, fp16_t* max, int layout, i
 #endif
 
 int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread)
+                     struct pool_param* pool_param, int num_thread)
 {
     int layout = input_tensor->layout;
     int type = input_tensor->data_type;
@@ -122,27 +121,27 @@ int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor,
     fp16_t* input = (fp16_t*)input_tensor->data;
     fp16_t* output = (fp16_t*)output_tensor->data;
 
-    for(int n = 0; n < batch; n++)
+    for (int n = 0; n < batch; n++)
     {
         const fp16_t* input_cur = input + n * input_chw;
-        for(int c = 0; c < channel; c++)
+        for (int c = 0; c < channel; c++)
         {
-            for(int ph = 0; ph < out_h; ph++)
+            for (int ph = 0; ph < out_h; ph++)
             {
-                for(int pw = 0; pw < out_w; pw++)
+                for (int pw = 0; pw < out_w; pw++)
                 {
                     int pool_size = 1;
                     int offset = 0;
                     int h_start = ph * stride_h - pad_h;
                     int h_end = h_start + kernel_h;
-                    if(h_end > in_h + pad_h)
+                    if (h_end > in_h + pad_h)
                         h_end = in_h + pad_h;
                     int w_start = pw * stride_w - pad_w;
                     int w_end = w_start + kernel_w;
-                    if(w_end > in_w + pad_w)
+                    if (w_end > in_w + pad_w)
                         w_end = in_w + pad_w;
 
-                    if(caffe_flavor)
+                    if (caffe_flavor)
                         pool_size = (h_end - h_start) * (w_end - w_start);
 
                     h_start = h_start > 0 ? h_start : 0;
@@ -150,25 +149,25 @@ int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor,
                     h_end = h_end < in_h ? h_end : in_h;
                     w_end = w_end < in_w ? w_end : in_w;
 
-                    if(!caffe_flavor)
+                    if (!caffe_flavor)
                         pool_size = (h_end - h_start) * (w_end - w_start);
-                    if(layout == 0)    // nchw
+                    if (layout == 0) // nchw
                         offset = n * output_chw + c * out_h * out_w + ph * out_w + pw;
                     else
                         offset = n * output_chw + ph * out_w * channel + pw * channel + c;
 
-                    if(method == 0)
+                    if (method == 0)
                     {
                         fp16_t max;
                         calc_max_fp16(input_cur, &max, layout, channel, in_h, in_w,
-                                        c, h_start, w_start, h_end, w_end);
+                                      c, h_start, w_start, h_end, w_end);
                         output[offset] = max;
                     }
-                    else if(method == 1)
+                    else if (method == 1)
                     {
                         fp16_t sum;
                         calc_sum_fp16(input_cur, &sum, layout, channel, in_h, in_w,
-                                        c, h_start, w_start, h_end, w_end);
+                                      c, h_start, w_start, h_end, w_end);
                         output[offset] = fp32_to_fp16(fp16_to_fp32(sum) / pool_size);
                     }
                     else
@@ -176,7 +175,6 @@ int ref_pooling_fp16(struct tensor* input_tensor, struct tensor* output_tensor,
                 }
             }
         }
-
     }
 #endif
 
diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c b/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c
index c78e40bc3..05f499ddf 100644
--- a/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c
+++ b/source/device/cpu/op/pooling/pooling_kernel_ref_fp32.c
@@ -41,7 +41,6 @@
 #define HCL_POOL_MAX 0 /* Max pooling     */
 #define HCL_POOL_AVG 1 /* Average pooling */
 
-
 static inline float calc_sum_fp32(const float* input, int layout, int c, int h, int w, int cur_ch, int start_h,
                                   int start_w, int end_h, int end_w)
 {
@@ -78,7 +77,7 @@ static inline float calc_max_fp32(const float* input, int layout, int c, int h,
 }
 
 int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread)
+                     struct pool_param* pool_param, int num_thread)
 {
     int layout = input_tensor->layout;
     int type = input_tensor->data_type;
@@ -105,7 +104,6 @@ int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
     int caffe_flavor = pool_param->caffe_flavor;
     int method = pool_param->pool_method;
 
-
     float* input = (float*)input_tensor->data;
     float* output = (float*)output_tensor->data;
 
@@ -141,19 +139,19 @@ int ref_pooling_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
 
                     if (!caffe_flavor)
                         pool_size = (h_end - h_start) * (w_end - w_start);
-                        
+
                     offset = n * output_chw + c * out_h * out_w + ph * out_w + pw;
 
                     if (method == HCL_POOL_MAX)
                     {
                         float max = calc_max_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start,
-                                                    h_end, w_end);
+                                                  h_end, w_end);
                         output[offset] = max;
                     }
                     else if (method == HCL_POOL_AVG)
                     {
                         float sum = calc_sum_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start,
-                                                    h_end, w_end);
+                                                  h_end, w_end);
                         output[offset] = sum / pool_size;
                     }
                     else
diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c b/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c
index f33a590d5..1ab3a14cd 100644
--- a/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c
+++ b/source/device/cpu/op/pooling/pooling_kernel_ref_int8.c
@@ -41,15 +41,14 @@
 #define HCL_POOL_MAX 0 /* Max pooling     */
 #define HCL_POOL_AVG 1 /* Average pooling */
 
-
 static inline int calc_sum_int8(const int8_t* input, int layout, int c, int h, int w, int cur_ch, int start_h,
                                 int start_w, int end_h, int end_w)
 {
     int sum = 0;
-    for(int i = start_h; i < end_h; i++)
-        for(int j = start_w; j < end_w; j++)
+    for (int i = start_h; i < end_h; i++)
+        for (int j = start_w; j < end_w; j++)
         {
-            if(layout == 0)
+            if (layout == 0)
                 sum += input[cur_ch * h * w + i * w + j];
             else
                 sum += input[i * w * c + j * c + cur_ch];
@@ -62,16 +61,16 @@ static inline int8_t calc_max_int8(const int8_t* input, int layout, int c, int h
                                    int start_w, int end_h, int end_w)
 {
     int8_t max = 0;
-    if(layout == 0)
+    if (layout == 0)
         max = input[cur_ch * h * w + start_h * w + start_w];
     else
         max = input[start_h * w * c + start_w * c + cur_ch];
 
     int8_t tmp = 0;
-    for(int i = start_h; i < end_h; i++)
-        for(int j = start_w; j < end_w; j++)
+    for (int i = start_h; i < end_h; i++)
+        for (int j = start_w; j < end_w; j++)
         {
-            if(layout == 0)
+            if (layout == 0)
                 tmp = input[cur_ch * h * w + i * w + j];
             else
                 tmp = input[i * w * c + j * c + cur_ch];
@@ -83,7 +82,7 @@ static inline int8_t calc_max_int8(const int8_t* input, int layout, int c, int h
 }
 
 int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread)
+                     struct pool_param* pool_param, int num_thread)
 {
     int layout = input_tensor->layout;
     int type = input_tensor->data_type;
@@ -110,8 +109,8 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor,
     int caffe_flavor = pool_param->caffe_flavor;
     int method = pool_param->pool_method;
 
-    int8_t* input_int8 = ( int8_t* )input_tensor->data;
-    int8_t* output_int8 = ( int8_t* )output_tensor->data;
+    int8_t* input_int8 = (int8_t*)input_tensor->data;
+    int8_t* output_int8 = (int8_t*)output_tensor->data;
 
     float input_scale = input_tensor->scale;
     float output_scale = output_tensor->scale;
@@ -119,7 +118,7 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor,
 
     for (int n = 0; n < batch; n++)
     {
-        const int8_t * input_cur = input_int8 + n * input_chw;
+        const int8_t* input_cur = input_int8 + n * input_chw;
         for (int c = 0; c < channel; c++)
         {
             for (int ph = 0; ph < out_h; ph++)
@@ -149,7 +148,7 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor,
 
                     if (!caffe_flavor)
                         pool_size = (h_end - h_start) * (w_end - w_start);
-                    if (layout == TENGINE_LAYOUT_NCHW)    // nchw
+                    if (layout == TENGINE_LAYOUT_NCHW) // nchw
                         offset = n * output_chw + c * out_h * out_w + ph * out_w + pw;
                     else
                         offset = n * output_chw + ph * out_w * channel + pw * channel + c;
@@ -157,9 +156,9 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor,
                     if (method == HCL_POOL_MAX)
                     {
                         int8_t max = calc_max_int8(input_cur, layout, channel, in_h, in_w, c, h_start, w_start,
-                                                    h_end, w_end);
+                                                   h_end, w_end);
 
-                        int32_t data_i32 = round((float )max * requant_scale);
+                        int32_t data_i32 = round((float)max * requant_scale);
                         if (data_i32 > 127)
                             data_i32 = 127;
                         else if (data_i32 < -127)
@@ -172,7 +171,7 @@ int ref_pooling_int8(struct tensor* input_tensor, struct tensor* output_tensor,
                                                         h_end, w_end);
                         float sum_fp32 = sum_i32 * input_scale;
                         sum_fp32 = sum_fp32 / (float)pool_size;
-                        int32_t data_i32 = round((float )sum_fp32 / output_scale);
+                        int32_t data_i32 = round((float)sum_fp32 / output_scale);
                         if (data_i32 > 127)
                             data_i32 = 127;
                         else if (data_i32 < -127)
diff --git a/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c b/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c
index effd0c6ad..54e6b8c68 100644
--- a/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c
+++ b/source/device/cpu/op/pooling/pooling_kernel_ref_uint8.c
@@ -41,7 +41,6 @@
 #define HCL_POOL_MAX 0 /* Max pooling     */
 #define HCL_POOL_AVG 1 /* Average pooling */
 
-
 static inline float calc_sum_fp32(const float* input, int layout, int c, int h, int w, int cur_ch, int start_h,
                                   int start_w, int end_h, int end_w)
 {
@@ -90,7 +89,7 @@ static inline float calc_max_fp32(const float* input, int layout, int c, int h,
 }
 
 int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
-                           struct pool_param* pool_param, int num_thread)
+                      struct pool_param* pool_param, int num_thread)
 {
     int layout = input_tensor->layout;
     int type = input_tensor->data_type;
@@ -117,8 +116,8 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
     int caffe_flavor = pool_param->caffe_flavor;
     int method = pool_param->pool_method;
 
-    uint8_t* input_uint8 = ( uint8_t* )input_tensor->data;
-    uint8_t* output_uint8 = ( uint8_t* )output_tensor->data;
+    uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
+    uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
 
     float input_scale = input_tensor->scale;
     float output_scale = output_tensor->scale;
@@ -126,8 +125,8 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
     int output_zero = output_tensor->zero_point;
 
     /* input dequant */
-    float* input_fp32 = ( float* )sys_malloc(input_tensor->elem_num * sizeof(float));
-    float* output_fp32 = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(input_tensor->elem_num * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float));
 
     for (int i = 0; i < input_tensor->elem_num; i++)
         input_fp32[i] = (input_uint8[i] - input_zero) * input_scale;
@@ -167,7 +166,7 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
 
                     if (!caffe_flavor)
                         pool_size = (h_end - h_start) * (w_end - w_start);
-                    if (layout == TENGINE_LAYOUT_NCHW)    // nchw
+                    if (layout == TENGINE_LAYOUT_NCHW) // nchw
                         offset = n * output_chw + c * out_h * out_w + ph * out_w + pw;
                     else
                         offset = n * output_chw + ph * out_w * channel + pw * channel + c;
@@ -175,13 +174,13 @@ int ref_pooling_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
                     if (method == HCL_POOL_MAX)
                     {
                         float max = calc_max_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start,
-                                                    h_end, w_end);
+                                                  h_end, w_end);
                         output[offset] = max;
                     }
                     else if (method == HCL_POOL_AVG)
                     {
                         float sum = calc_sum_fp32(input_cur, layout, channel, in_h, in_w, c, h_start, w_start,
-                                                    h_end, w_end);
+                                                  h_end, w_end);
                         output[offset] = sum / pool_size;
                     }
                     else
diff --git a/source/device/cpu/op/pooling/pooling_ref.c b/source/device/cpu/op/pooling/pooling_ref.c
index 9261a6eec..df8ecb6a2 100644
--- a/source/device/cpu/op/pooling/pooling_ref.c
+++ b/source/device/cpu/op/pooling/pooling_ref.c
@@ -38,7 +38,6 @@
 
 #include "pooling_kernel_ref.h"
 
-
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -46,35 +45,34 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem;
 
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_pooling_fp32(input_tensor, output_tensor, pool_param, exec_graph->num_thread);
     else if (input_tensor->data_type == TENGINE_DT_FP16)
-        #if MACOS
+#if MACOS
         TLOG_ERR("FP16 not support mac os");
-        #else
+#else
         ret = ref_pooling_fp16(input_tensor, output_tensor, pool_param, exec_graph->num_thread);
-        #endif
+#endif
     else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_pooling_uint8(input_tensor, output_tensor, pool_param, exec_graph->num_thread);
     else if (input_tensor->data_type == TENGINE_DT_INT8)
         ret = ref_pooling_int8(input_tensor, output_tensor, pool_param, exec_graph->num_thread);
     else
-        TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);    
+        TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
 
     return 0;
 }
 
-
 static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem;
 
     int ret = 0;
 
@@ -84,9 +82,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     int input_w = input_tensor->dims[3];
     int output_h, output_w;
 
-    if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w &&
-        pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 &&
-        pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0)
+    if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w && pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 && pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0)
     {
         pool_param->global = 1;
     }
@@ -131,8 +127,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
 
     int dims[4];
 
-    if (output_tensor->dims[1] != channel || output_tensor->dims[2] != output_h ||
-        output_tensor->dims[3] != output_w)
+    if (output_tensor->dims[1] != channel || output_tensor->dims[2] != output_h || output_tensor->dims[3] != output_w)
     {
         dims[0] = batch;
         dims[1] = channel;
@@ -144,31 +139,26 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     return ret;
 }
 
-
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
 }
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* dev)
 {
     return 0;
 }
 
-
 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* dev)
 {
     return 0;
 }
 
-
 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
 {
     return OPS_SCORE_CANDO;
 }
 
-
 static struct node_ops hcl_node_ops = {.prerun = NULL,
                                        .run = run,
                                        .reshape = reshape,
@@ -177,13 +167,11 @@ static struct node_ops hcl_node_ops = {.prerun = NULL,
                                        .release_node = release_node,
                                        .score = score};
 
-
 int register_pooling_ref_op()
 {
     return register_builtin_node_ops(OP_POOL, &hcl_node_ops);
 }
 
-
 int unregister_pooling_ref_op()
 {
     unregister_builtin_node_ops(OP_POOL, &hcl_node_ops);
diff --git a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c
index 2c96992da..9012a5686 100644
--- a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c
+++ b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -48,8 +47,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] ||
-        input_tensor->dims[3] != output_tensor->dims[3])
+    if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3])
         ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
 
     return ret;
diff --git a/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c b/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c
index feb139cfc..33a6c6817 100644
--- a/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c
+++ b/source/device/cpu/op/prelu/cortex_a/prelu_kernel_arm.c
@@ -26,7 +26,6 @@
 
 #include <arm_neon.h>
 
-
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
diff --git a/source/device/cpu/op/prelu/prelu_ref.c b/source/device/cpu/op/prelu/prelu_ref.c
index d338122be..6dd8e4151 100644
--- a/source/device/cpu/op/prelu/prelu_ref.c
+++ b/source/device/cpu/op/prelu/prelu_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -155,7 +154,7 @@ static int ref_prelu_fp32(struct tensor* input_tensor, struct tensor* output_ten
         }
     }
 
-    return 0; 
+    return 0;
 }
 
 static int ref_prelu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct tensor* slope_tensor)
@@ -176,13 +175,13 @@ static int ref_prelu_uint8(struct tensor* input_tensor, struct tensor* output_te
     int output_size = output_tensor->elem_num;
     int slope_size = slope_tensor->elem_num;
 
-    float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float));
-    float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float));
-    float* slope_fp32 = ( float* )sys_malloc(slope_size * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float));
+    float* slope_fp32 = (float*)sys_malloc(slope_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_fp32[i] = (( float )input_data[i] - ( float )input_zero) * input_scale;
+        input_fp32[i] = ((float)input_data[i] - (float)input_zero) * input_scale;
     }
     for (int i = 0; i < slope_size; i++)
     {
@@ -333,17 +332,17 @@ static int ref_prelu_int8(struct tensor* input_tensor, struct tensor* output_ten
     int output_size = output_tensor->elem_num;
     int slope_size = slope_tensor->elem_num;
 
-    float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float));
-    float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float));
-    float* slope_fp32 = ( float* )sys_malloc(slope_size * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float));
+    float* slope_fp32 = (float*)sys_malloc(slope_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_fp32[i] = ( float )data[i] * input_scale;
+        input_fp32[i] = (float)data[i] * input_scale;
     }
     for (int i = 0; i < slope_size; i++)
     {
-        slope_fp32[i] = ( float )slope[i] * slope_scale;
+        slope_fp32[i] = (float)slope[i] * slope_scale;
     }
 
     int offset = 0;
@@ -408,8 +407,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] ||
-        input_tensor->dims[3] != output_tensor->dims[3])
+    if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3])
         ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
 
     return ret;
@@ -430,9 +428,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_prelu_fp32(input_tensor, output_tensor, slope_tensor);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_prelu_uint8(input_tensor, output_tensor, slope_tensor);
-    else if(input_tensor->data_type == TENGINE_DT_INT8)
+    else if (input_tensor->data_type == TENGINE_DT_INT8)
         ret = ref_prelu_int8(input_tensor, output_tensor, slope_tensor);
     else
         TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
diff --git a/source/device/cpu/op/priorbox/priorbox_ref.c b/source/device/cpu/op/priorbox/priorbox_ref.c
index 99f3ccc05..39df5ec09 100644
--- a/source/device/cpu/op/priorbox/priorbox_ref.c
+++ b/source/device/cpu/op/priorbox/priorbox_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 #define T_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define T_MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -55,15 +54,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
     struct tensor* featmap_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    struct tensor* data_tensor    = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
-    struct tensor* output_tensor  = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    priorbox_param_t* param = ( priorbox_param_t* )(ir_node->op.param_mem);
+    struct tensor* data_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
+    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
+    priorbox_param_t* param = (priorbox_param_t*)(ir_node->op.param_mem);
 
     float* output_fp32 = NULL;
     if (output_tensor->data_type == TENGINE_DT_FP32)
-        output_fp32 = ( float* )output_tensor->data;
+        output_fp32 = (float*)output_tensor->data;
     else if (output_tensor->data_type == TENGINE_DT_UINT8 || output_tensor->data_type == TENGINE_DT_INT8)
-        output_fp32 = ( float* )sys_malloc(output_tensor->elem_num * sizeof(float ));
+        output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float));
 
     const int data_height = data_tensor->dims[2];
     const int data_width = data_tensor->dims[3];
@@ -83,8 +82,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     float step_w, step_h;
     if (param->step_h == 0 || param->step_w == 0)
     {
-        step_w = ( float )(image_w) / feat_width;
-        step_h = ( float )(image_h) / feat_height;
+        step_w = (float)(image_w) / feat_width;
+        step_h = (float)(image_h) / feat_height;
     }
     else
     {
@@ -105,7 +104,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
             float center_x = (w + offset_) * step_w;
             float center_y = (h + offset_) * step_h;
             float box_width, box_height;
-            for (int s = 0; s < ( int )param->min_size_num; ++s)
+            for (int s = 0; s < (int)param->min_size_num; ++s)
             {
                 int min_size_ = param->min_size[s];
                 // first prior: aspect_ratio = 1, size = min_size
@@ -130,7 +129,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
                 }
 
                 // rest of priors
-                for (int r = 0; r < ( int )param->aspect_ratio_size; ++r)
+                for (int r = 0; r < (int)param->aspect_ratio_size; ++r)
                 {
                     float ar = param->aspect_ratio[r];
 
@@ -179,7 +178,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     {
         uint8_t* output_org = (uint8_t*)output_tensor->data;
 
-        for (int i=0; i<output_tensor->elem_num; i++)
+        for (int i = 0; i < output_tensor->elem_num; i++)
         {
             int udata = (int)(output_fp32[i] / output_tensor->scale + output_tensor->zero_point);
             if (udata > 255)
@@ -197,7 +196,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     {
         int8_t* output_org = (int8_t*)output_tensor->data;
 
-        for (int i=0; i<output_tensor->elem_num; i++)
+        for (int i = 0; i < output_tensor->elem_num; i++)
         {
             int data_i32 = round(output_fp32[i] / output_tensor->scale);
             if (data_i32 > 127)
diff --git a/source/device/cpu/op/psroipooling/psroipooling_ref.c b/source/device/cpu/op/psroipooling/psroipooling_ref.c
index e7ff48fb9..9039a3f8d 100644
--- a/source/device/cpu/op/psroipooling/psroipooling_ref.c
+++ b/source/device/cpu/op/psroipooling/psroipooling_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 #define T_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define T_MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -68,8 +67,8 @@ static int ref_psroipooling_fp32(struct tensor* featmap_tensor, struct tensor* r
         int roi_w = T_MAX(roi_x1 - roi_x0, 0);
         int roi_h = T_MAX(roi_y1 - roi_y0, 0);
 
-        float bin_w = ( float )roi_w / ( float )out_w;
-        float bin_h = ( float )roi_h / ( float )out_h;
+        float bin_w = (float)roi_w / (float)out_w;
+        float bin_h = (float)roi_h / (float)out_h;
 
         for (int c = 0; c < output_dim; c++)
         {
@@ -80,10 +79,10 @@ static int ref_psroipooling_fp32(struct tensor* featmap_tensor, struct tensor* r
                 {
                     float* inptr = featmap + (c * out_h + h) * out_w + w;
 
-                    int hstart = floor(roi_y0 + ( float )( h )*bin_h);
-                    int wstart = floor(roi_x0 + ( float )( w )*bin_w);
-                    int hend = ceil(roi_y0 + ( float )(h + 1) * bin_h);
-                    int wend = ceil(roi_x0 + ( float )(w + 1) * bin_w);
+                    int hstart = floor(roi_y0 + (float)(h)*bin_h);
+                    int wstart = floor(roi_x0 + (float)(w)*bin_w);
+                    int hend = ceil(roi_y0 + (float)(h + 1) * bin_h);
+                    int wend = ceil(roi_x0 + (float)(w + 1) * bin_w);
 
                     hstart = T_MIN(T_MAX(hstart, 0), in_h);
                     wstart = T_MIN(T_MAX(wstart, 0), in_w);
@@ -102,7 +101,7 @@ static int ref_psroipooling_fp32(struct tensor* featmap_tensor, struct tensor* r
                             sum += inptr[index];
                         }
                     }
-                    outptr[w] = is_empty ? 0.f : (sum / ( float )area);
+                    outptr[w] = is_empty ? 0.f : (sum / (float)area);
                 }
                 outptr += out_w;
             }
@@ -133,7 +132,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     featmap_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     roi_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct psroipooling_param* psroipooling_param = ( struct psroipooling_param* )ir_node->op.param_mem;
+    struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)ir_node->op.param_mem;
 
     ref_psroipooling_fp32(featmap_tensor, roi_tensor, output_tensor, psroipooling_param, exec_graph->num_thread);
 
diff --git a/source/device/cpu/op/reciprocal/reciprocal_ref.c b/source/device/cpu/op/reciprocal/reciprocal_ref.c
index 95903a2f4..c770bb657 100644
--- a/source/device/cpu/op/reciprocal/reciprocal_ref.c
+++ b/source/device/cpu/op/reciprocal/reciprocal_ref.c
@@ -72,8 +72,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-	int ret = -1;
-    if(input_tensor->data_type == TENGINE_DT_FP32)
+    int ret = -1;
+    if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_reciprocal_fp32(input_tensor, output_tensor, exec_graph->num_thread);
     else
         printf("Input data type %d not to be supported.\n", input_tensor->data_type);
@@ -98,13 +98,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {
-        .prerun = NULL,
-        .run = run,
-        .reshape = reshape,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score};
+    .prerun = NULL,
+    .run = run,
+    .reshape = reshape,
+    .postrun = NULL,
+    .init_node = init_node,
+    .release_node = release_node,
+    .score = score};
 
 int register_reciprocal_ref_op()
 {
diff --git a/source/device/cpu/op/reducel2/reducel2_ref.c b/source/device/cpu/op/reducel2/reducel2_ref.c
index 3108faf76..e92f98caf 100644
--- a/source/device/cpu/op/reducel2/reducel2_ref.c
+++ b/source/device/cpu/op/reducel2/reducel2_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 struct ref_reducel2_param
 {
     int axis;
@@ -89,10 +88,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct reducel2_param* op_param = ( struct reducel2_param* )ir_node->op.param_mem;
+    struct reducel2_param* op_param = (struct reducel2_param*)ir_node->op.param_mem;
 
-    void* in_data = ( void* )input_tensor->data;
-    void* out_data = ( void* )output_tensor->data;
+    void* in_data = (void*)input_tensor->data;
+    void* out_data = (void*)output_tensor->data;
 
     struct ref_reducel2_param param;
 
diff --git a/source/device/cpu/op/reduction/reduction_kernel_ref.h b/source/device/cpu/op/reduction/reduction_kernel_ref.h
index 4aa4f1ec1..c3459b2da 100644
--- a/source/device/cpu/op/reduction/reduction_kernel_ref.h
+++ b/source/device/cpu/op/reduction/reduction_kernel_ref.h
@@ -30,7 +30,6 @@
 #include <math.h>
 #include <stdio.h>
 
-
 #define FLOAT_MAX 3.4028235E38
 #define FLOAT_MIN -3.4028235E38
 
@@ -148,7 +147,7 @@ struct reduce_param_ref
 };
 
 static int ref_reduce_uint8(uint8_t* data, uint8_t* out_data, int dim0, int dim1, int dim2, int dim3, int out_size,
-                           struct reduce_param_ref* param, int dim_num, int* dims)
+                            struct reduce_param_ref* param, int dim_num, int* dims)
 {
     int offset = 0;
     int param_dim0 = param->param_dim[0];
@@ -164,7 +163,7 @@ static int ref_reduce_uint8(uint8_t* data, uint8_t* out_data, int dim0, int dim1
     {
         if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num > 4))
         {
-            if(dim_num == 5)
+            if (dim_num == 5)
             {
                 sum_5d_ax1_uint8(dims, dim_num, data, out_data, in_scale, in_zp, out_scale, out_zp);
             }
@@ -177,7 +176,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
                            struct reduce_param_ref* param, int dim_num, int* dims)
 {
     int offset = 0;
-    float* tmp = ( float* )sys_malloc(sizeof(float) * out_size);
+    float* tmp = (float*)sys_malloc(sizeof(float) * out_size);
     memset(tmp, 0, sizeof(float) * out_size);
     int param_dim0 = param->param_dim[0];
     int param_dim1 = param->param_dim[1];
@@ -186,8 +185,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
 
     if (param->type == 0)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             for (int n = 0; n < dim0; n++)
             {
@@ -205,9 +203,10 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
                 }
             }
         }
-        else if(param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num > 4))
+        else if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num > 4))
         {
-            if(dim_num == 5){
+            if (dim_num == 5)
+            {
                 sum_5d_ax1(dims, dim_num, data, tmp);
             }
         }
@@ -215,7 +214,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num <= 4) )
+        else if (param_dim0 == 1 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2 && (dim_num <= 4))
         {
             fprintf(stderr, "wrond dim_num %d \n", dim_num);
             sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp);
@@ -228,83 +227,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             sum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             sum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -314,18 +302,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -335,18 +318,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -356,18 +334,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -381,8 +354,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce mean
     else if (param->type == 1)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 0.f;
             for (int n = 0; n < dim0; n++)
@@ -418,83 +390,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             mean_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             mean_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             mean_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             mean_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             mean_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             mean_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             mean_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             mean_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             mean_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             mean_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -504,18 +465,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -525,18 +481,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             mean_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -546,18 +497,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             mean_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -571,8 +517,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce asum
     else if (param->type == 2)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 0.f;
             for (int n = 0; n < dim0; n++)
@@ -608,83 +553,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             asum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             asum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -694,18 +628,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -715,18 +644,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -736,18 +660,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -761,8 +680,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce sqsum
     else if (param->type == 3)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 0.f;
             for (int n = 0; n < dim0; n++)
@@ -798,83 +716,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             sqsum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             sqsum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             sqsum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             sqsum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -884,18 +791,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -905,18 +807,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             sqsum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -926,18 +823,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             sqsum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -951,8 +843,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce max
     else if (param->type == 4)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = FLOAT_MIN;
             for (int n = 0; n < dim0; n++)
@@ -989,83 +880,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             max_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             max_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             max_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             max_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             max_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             max_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             max_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             max_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             max_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             max_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1075,18 +955,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1096,18 +971,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             max_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -1117,18 +987,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             max_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -1142,8 +1007,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce min
     else if (param->type == 5)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = FLOAT_MAX;
             for (int n = 0; n < dim0; n++)
@@ -1180,83 +1044,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             min_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             min_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             min_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             min_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             min_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             min_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             min_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             min_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             min_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             min_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1266,18 +1119,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1287,18 +1135,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             min_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -1308,18 +1151,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             min_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -1333,8 +1171,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce prod
     else if (param->type == 6)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 1.f;
             for (int n = 0; n < dim0; n++)
@@ -1370,83 +1207,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             prod_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             prod_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             prod_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             prod_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             prod_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             prod_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             prod_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             prod_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             prod_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             prod_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1456,18 +1282,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1477,18 +1298,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             prod_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -1498,18 +1314,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             prod_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -1523,8 +1334,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce l1
     else if (param->type == 7)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 0.f;
             for (int n = 0; n < dim0; n++)
@@ -1560,83 +1370,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             asum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             asum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1646,18 +1445,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1667,18 +1461,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             asum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -1688,18 +1477,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             asum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -1713,8 +1497,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce l2
     else if (param->type == 8)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 0.f;
             for (int n = 0; n < dim0; n++)
@@ -1750,83 +1533,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             l2_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             sum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             sum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             sum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             l2_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             sum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             l2_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             sum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             l2_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             sum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1836,18 +1608,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -1857,18 +1624,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             l2_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -1878,18 +1640,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             l2_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -1903,8 +1660,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     // reduce log sum
     else if (param->type == 9)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 0.f;
             for (int n = 0; n < dim0; n++)
@@ -1940,83 +1696,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             logsum_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             logsum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             logsum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             logsum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             logsum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             logsum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             sum_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             logsum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -2026,18 +1771,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -2047,18 +1787,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             sum_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -2068,18 +1803,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             sum_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -2092,8 +1822,7 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
     }
     else if (param->type == 10)
     {
-        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) ||
-            (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
+        if ((param_dim0 == -2 && param_dim1 == -2 && param_dim2 == -2 && param_dim3 == -2) || (param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2 && param_dim3 == 3))
         {
             float s_tmp = 0.f;
             for (int n = 0; n < dim0; n++)
@@ -2129,83 +1858,72 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
         {
             logsumexp_4d_ax3(dim0, dim1, dim2, dim3, data, tmp);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 0) || (param_dim0 == 0 && param_dim1 == 1)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
             sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
             logsum_3d_ax0(dim1, dim2, dim3, tmp, tmp_01);
 
             free(tmp_01);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
             sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
             logsum_3d_ax1(dim1, dim2, dim3, tmp, tmp_02);
 
             free(tmp_02);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 0)))
         {
             // reduce on axis0
-            float* tmp_03 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_03 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_03, 0, sizeof(float) * dim1 * dim2 * dim3);
             sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_03);
             logsum_3d_ax2(dim1, dim2, dim3, tmp, tmp_03);
             free(tmp_03);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2) || (param_dim0 == 2 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
             sumexp_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
             logsum_3d_ax1(dim0, dim2, dim3, tmp, tmp_12);
 
             free(tmp_12);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 1)))
         {
             // reduce on axis1
-            float* tmp_13 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_13 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_13, 0, sizeof(float) * dim0 * dim2 * dim3);
             sumexp_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_13);
             logsum_3d_ax2(dim0, dim2, dim3, tmp, tmp_13);
 
             free(tmp_13);
         }
-        else if (param_dim2 == -2 && param_dim3 == -2 &&
-                 ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
+        else if (param_dim2 == -2 && param_dim3 == -2 && ((param_dim0 == 2 && param_dim1 == 3) || (param_dim0 == 3 && param_dim1 == 2)))
         {
             // reduce on axis2
-            float* tmp_23 = ( float* )malloc(sizeof(float) * dim0 * dim1 * dim3);
+            float* tmp_23 = (float*)malloc(sizeof(float) * dim0 * dim1 * dim3);
             memset(tmp_23, 0, sizeof(float) * dim0 * dim1 * dim3);
             sumexp_4d_ax2(dim0, dim1, dim2, dim3, data, tmp_23);
             logsum_3d_ax2(dim0, dim1, dim3, tmp, tmp_23);
 
             free(tmp_23);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 0) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_0 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_0 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_0, 0, sizeof(float) * dim2 * dim3);
 
             sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -2215,18 +1933,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_0);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 1 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_01 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_01 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_01, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim2 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim2 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim2 * dim3);
 
             sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_01);
@@ -2236,18 +1949,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_01);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) ||
-                                      (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
+        else if (param_dim3 == -2 && ((param_dim0 == 0 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 0 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 0 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 0) || (param_dim0 == 3 && param_dim1 == 0 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 0)))
         {
             // reduce on axis0
-            float* tmp_02 = ( float* )malloc(sizeof(float) * dim1 * dim2 * dim3);
+            float* tmp_02 = (float*)malloc(sizeof(float) * dim1 * dim2 * dim3);
             memset(tmp_02, 0, sizeof(float) * dim1 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim1 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim1 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim1 * dim3);
 
             sumexp_4d_ax0(dim0, dim1, dim2, dim3, data, tmp_02);
@@ -2257,18 +1965,13 @@ static int ref_reduce_fp32(float* data, float* out_data, int dim0, int dim1, int
             free(tmp_02);
             free(tmp_1);
         }
-        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) ||
-                                      (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) ||
-                                      (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) ||
-                                      (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) ||
-                                      (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) ||
-                                      (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
+        else if (param_dim3 == -2 && ((param_dim0 == 1 && param_dim1 == 2 && param_dim2 == 3) || (param_dim0 == 1 && param_dim1 == 3 && param_dim2 == 2) || (param_dim0 == 2 && param_dim1 == 1 && param_dim2 == 3) || (param_dim0 == 2 && param_dim1 == 3 && param_dim2 == 1) || (param_dim0 == 3 && param_dim1 == 1 && param_dim2 == 2) || (param_dim0 == 3 && param_dim1 == 2 && param_dim2 == 1)))
         {
             // reduce on axis0
-            float* tmp_12 = ( float* )malloc(sizeof(float) * dim0 * dim2 * dim3);
+            float* tmp_12 = (float*)malloc(sizeof(float) * dim0 * dim2 * dim3);
             memset(tmp_12, 0, sizeof(float) * dim0 * dim2 * dim3);
 
-            float* tmp_1 = ( float* )malloc(sizeof(float) * dim0 * dim3);
+            float* tmp_1 = (float*)malloc(sizeof(float) * dim0 * dim3);
             memset(tmp_1, 0, sizeof(float) * dim0 * dim3);
 
             sumexp_4d_ax1(dim0, dim1, dim2, dim3, data, tmp_12);
@@ -2436,11 +2139,14 @@ void sum_5d_ax1(int* dims, int dim_num, float* data, float* tmp)
     int dim2 = dims[2];
     int dim3 = dims[3];
     int dim4 = dims[4];
-    int chw = dim2*dim3*dim4;
-    for(int j = 0; j < dim0; j++){
-        for(int n = 0; n < dim1; n++){
-            for(int size = 0; size < chw; size++){
-                tmp[size] += data[n*chw + size];
+    int chw = dim2 * dim3 * dim4;
+    for (int j = 0; j < dim0; j++)
+    {
+        for (int n = 0; n < dim1; n++)
+        {
+            for (int size = 0; size < chw; size++)
+            {
+                tmp[size] += data[n * chw + size];
             }
         }
     }
@@ -2453,20 +2159,24 @@ void sum_5d_ax1_uint8(int* dims, int dim_num, uint8_t* data, uint8_t* out_data,
     int dim2 = dims[2];
     int dim3 = dims[3];
     int dim4 = dims[4];
-    int chw = dim2*dim3*dim4;
+    int chw = dim2 * dim3 * dim4;
 
-    float* tmp = ( float* )malloc(sizeof(float) * chw);
+    float* tmp = (float*)malloc(sizeof(float) * chw);
     memset(tmp, 0, sizeof(float) * chw);
 
-    for(int j = 0; j < dim0; j++){
-        for(int n = 0; n < dim1; n++){
-            for(int size = 0; size < chw; size++){
-                float tmp_in_data = in_scale * (data[n*chw + size] - in_zp);
+    for (int j = 0; j < dim0; j++)
+    {
+        for (int n = 0; n < dim1; n++)
+        {
+            for (int size = 0; size < chw; size++)
+            {
+                float tmp_in_data = in_scale * (data[n * chw + size] - in_zp);
                 tmp[size] += tmp_in_data;
             }
         }
     }
-    for(int size = 0; size < chw; size++){
+    for (int size = 0; size < chw; size++)
+    {
         int32_t data_i32 = round(tmp[size] / out_scale + out_zp);
         if (data_i32 > 255)
             data_i32 = 255;
@@ -3288,7 +2998,7 @@ void l2_4d_ax0(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp)
         for (int n = 0; n < dim0; n++)
         {
             int offset = n * dim1 * dim2 * dim3 + j;
-            tmp[j] += sqrt((double )data[offset] * data[offset]);
+            tmp[j] += sqrt((double)data[offset] * data[offset]);
         }
     }
 }
@@ -3301,7 +3011,7 @@ void l2_4d_ax1(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp)
             for (int h = 0; h < dim1; h++)
             {
                 int offset = n * dim1 * dim2 * dim3 + h * dim2 * dim3 + cw;
-                tmp[n * dim2 * dim3 + cw] += sqrt((double )data[offset] * data[offset]);
+                tmp[n * dim2 * dim3 + cw] += sqrt((double)data[offset] * data[offset]);
             }
         }
     }
@@ -3317,7 +3027,7 @@ void l2_4d_ax2(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp)
                 for (int w = 0; w < dim2; w++)
                 {
                     int offset = n * dim1 * dim2 * dim3 + h * dim2 * dim3 + w * dim3 + c;
-                    tmp[n * dim1 * dim3 + h * dim3 + c] += sqrt((double )data[offset] * data[offset]);
+                    tmp[n * dim1 * dim3 + h * dim3 + c] += sqrt((double)data[offset] * data[offset]);
                 }
             }
         }
@@ -3334,7 +3044,7 @@ void l2_4d_ax3(int dim0, int dim1, int dim2, int dim3, float* data, float* tmp)
                 for (int c = 0; c < dim3; c++)
                 {
                     int offset = n * dim1 * dim2 * dim3 + h * dim2 * dim3 + w * dim3 + c;
-                    tmp[n * dim1 * dim2 + h * dim2 + w] += sqrt((double )data[offset] * data[offset]);
+                    tmp[n * dim1 * dim2 + h * dim2 + w] += sqrt((double)data[offset] * data[offset]);
                 }
             }
         }
@@ -3347,7 +3057,7 @@ void l2_3d_ax0(int dim1, int dim2, int dim3, float* tmp, float* tmp_01)
         for (int h = 0; h < dim1; h++)
         {
             int index = h * dim2 * dim3 + wc;
-            tmp[wc] += sqrt((double )tmp_01[index] * tmp_01[index]);
+            tmp[wc] += sqrt((double)tmp_01[index] * tmp_01[index]);
         }
     }
 }
@@ -3360,7 +3070,7 @@ void l2_3d_ax1(int dim1, int dim2, int dim3, float* tmp, float* tmp_02)
             for (int w = 0; w < dim2; w++)
             {
                 int index = h * dim2 * dim3 + w * dim3 + c;
-                tmp[h * dim3 + c] += sqrt((double )tmp_02[index] * tmp_02[index]);
+                tmp[h * dim3 + c] += sqrt((double)tmp_02[index] * tmp_02[index]);
             }
         }
     }
@@ -3374,7 +3084,7 @@ void l2_3d_ax2(int dim1, int dim2, int dim3, float* tmp, float* tmp_03)
             for (int c = 0; c < dim3; c++)
             {
                 int index = h * dim2 * dim3 + w * dim3 + c;
-                tmp[h * dim2 + w] += sqrt((double )tmp_03[index] * tmp_03[index]);
+                tmp[h * dim2 + w] += sqrt((double)tmp_03[index] * tmp_03[index]);
             }
         }
     }
@@ -3386,7 +3096,7 @@ void l2_2d_ax0(int dim1, int dim2, float* tmp, float* tmp_0)
         for (int h = 0; h < dim1; h++)
         {
             int index = h * dim2 + w;
-            tmp[w] += sqrt((double )tmp_0[index] * tmp_0[index]);
+            tmp[w] += sqrt((double)tmp_0[index] * tmp_0[index]);
         }
     }
 }
@@ -3397,7 +3107,7 @@ void l2_2d_ax1(int dim1, int dim2, float* tmp, float* tmp_1)
         for (int w = 0; w < dim2; w++)
         {
             int index = h * dim2 + w;
-            tmp[h] += sqrt((double )tmp_1[index] * tmp_1[index]);
+            tmp[h] += sqrt((double)tmp_1[index] * tmp_1[index]);
         }
     }
 }
diff --git a/source/device/cpu/op/reduction/reduction_ref.c b/source/device/cpu/op/reduction/reduction_ref.c
index d4641652a..f3353f326 100644
--- a/source/device/cpu/op/reduction/reduction_ref.c
+++ b/source/device/cpu/op/reduction/reduction_ref.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -61,7 +60,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct reduction_param* reduction_param = ( struct reduction_param* )ir_node->op.param_mem;
+    struct reduction_param* reduction_param = (struct reduction_param*)ir_node->op.param_mem;
     struct reduce_param_ref param;
     int out_tensor_size = 1;
     for (int i = 0; i < output_tensor->dim_num; i++)
@@ -71,8 +70,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int element_size = output_tensor->elem_size;
 
     // int dims[4] = {1, 1, 1, 1};
-    int* dims = (int*)malloc(input_tensor->dim_num*sizeof(int));
-    memset(dims, 0, input_tensor->dim_num*sizeof(int));
+    int* dims = (int*)malloc(input_tensor->dim_num * sizeof(int));
+    memset(dims, 0, input_tensor->dim_num * sizeof(int));
     for (int i = 0; i < input_tensor->dim_num; i++)
     {
         dims[i] = input_tensor->dims[i];
@@ -81,7 +80,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int dim1 = dims[1];
     int dim2 = dims[2];
     int dim3 = dims[3];
-    
 
     param.param_dim[0] = reduction_param->dim_0;
     param.param_dim[1] = reduction_param->dim_1;
@@ -93,8 +91,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = 0;
     if (input_tensor->data_type == TENGINE_DT_FP32)
     {
-        ret = ref_reduce_fp32(( float* )input_tensor->data, ( float* )output_tensor->data, dim0, dim1, dim2, dim3,
-                                out_tensor_size, &param, in_dim_num, dims);
+        ret = ref_reduce_fp32((float*)input_tensor->data, (float*)output_tensor->data, dim0, dim1, dim2, dim3,
+                              out_tensor_size, &param, in_dim_num, dims);
     }
     else if (input_tensor->data_type == TENGINE_DT_UINT8)
     {
@@ -102,8 +100,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         param.output_scale = output_tensor->scale;
         param.input_zp = input_tensor->zero_point;
         param.output_zp = output_tensor->zero_point;
-        ret = ref_reduce_uint8(( uint8_t* )input_tensor->data, ( uint8_t* )output_tensor->data, dim0, dim1, dim2, dim3,
-                                out_tensor_size, &param, in_dim_num, dims);
+        ret = ref_reduce_uint8((uint8_t*)input_tensor->data, (uint8_t*)output_tensor->data, dim0, dim1, dim2, dim3,
+                               out_tensor_size, &param, in_dim_num, dims);
     }
     free(dims);
 
diff --git a/source/device/cpu/op/region/region_ref.c b/source/device/cpu/op/region/region_ref.c
index 1b02e8178..3bb0b37a1 100644
--- a/source/device/cpu/op/region/region_ref.c
+++ b/source/device/cpu/op/region/region_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static int entry_index(int batch, int location, int entry, int hw, int chw, int classes)
 {
     int coords = 4;
@@ -157,7 +156,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct region_param* region_param = ( struct region_param* )ir_node->op.param_mem;
+    struct region_param* region_param = (struct region_param*)ir_node->op.param_mem;
 
     ref_region_fp32(input_tensor, output_tensor, region_param, exec_graph->num_thread);
 
diff --git a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c
index 2a49d6fbf..0f885ba8b 100644
--- a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c
+++ b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c
@@ -30,7 +30,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -51,7 +50,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct relu_param* relu_param = ( struct relu_param* )ir_node->op.param_mem;
+    struct relu_param* relu_param = (struct relu_param*)ir_node->op.param_mem;
 
     perf_relu_fp32(input_tensor, output_tensor, relu_param->negative_slope, exec_graph->num_thread);
 
diff --git a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h
index 35296a1d0..f1ff0f56a 100644
--- a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h
+++ b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.h
@@ -33,7 +33,6 @@
 
 #include <arm_neon.h>
 
-
 static int perf_relu_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope,
                           int num_thread)
 {
@@ -66,7 +65,7 @@ static int perf_relu_fp32(struct tensor* input_tensor, struct tensor* output_ten
                 int remain = size - (nn << 2);
 #else
                 int remain = size;
-#endif    // __ARM_NEON
+#endif // __ARM_NEON
 
 #if __ARM_NEON
                 float32x4_t _zero = vdupq_n_f32(0.f);
@@ -110,7 +109,7 @@ static int perf_relu_fp32(struct tensor* input_tensor, struct tensor* output_ten
                 int remain = size - (nn << 2);
 #else
                 int remain = size;
-#endif    // __ARM_NEON
+#endif // __ARM_NEON
 
 #if __ARM_NEON
                 float32x4_t _zero = vdupq_n_f32(0.f);
diff --git a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c
index f499251ca..026206ef6 100644
--- a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c
+++ b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.c
@@ -30,12 +30,11 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-
 static inline int relu_kernel(const int i, const int id, const void* data, const float* input, float* output,
                               const float slope)
 {
     float32x4_t _zero = vdupq_n_f32(0.f);
-    int step = (( int* )data)[0];
+    int step = ((int*)data)[0];
     const float* cur_input = input + id * step;
     float* cur_output = output + id * step;
     if (slope == 0)
@@ -80,8 +79,8 @@ static inline int relu_kernel(const int i, const int id, const void* data, const
 int relu_arm_run(struct tensor* output_tensor, struct tensor* input_tensor, struct relu_param* relu_param,
                  int num_thread)
 {
-    float* data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    float* data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
     float negativeslope = relu_param->negative_slope;
 
     int chan_num = input_tensor->dims[0] * input_tensor->dims[1];
diff --git a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h
index b4dd59ca8..25439d5d9 100644
--- a/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h
+++ b/source/device/cpu/op/relu/cortex-a/relu_kernel_arm.h
@@ -31,7 +31,6 @@
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int relu_arm_run(struct tensor* output_tensor, struct tensor* input_tensor, struct relu_param* relu_param,
                  int num_thread);
 
diff --git a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c
index 2e19f4260..72d506512 100644
--- a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c
+++ b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c
@@ -37,7 +37,6 @@
 
 #include "arm_math.h"
 
-
 /**
  * @brief Q7 RELU function
  * @param[in,out]   data        pointer to input
diff --git a/source/device/cpu/op/relu/relu_kernel_ref.h b/source/device/cpu/op/relu/relu_kernel_ref.h
index e4927d200..981b0cb53 100644
--- a/source/device/cpu/op/relu/relu_kernel_ref.h
+++ b/source/device/cpu/op/relu/relu_kernel_ref.h
@@ -25,12 +25,10 @@
 #ifndef __RELU_KERNEL_REF_H__
 #define __RELU_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
 
-
 int ref_relu_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope);
 
 int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope);
diff --git a/source/device/cpu/op/relu/relu_kernel_ref_fp16.c b/source/device/cpu/op/relu/relu_kernel_ref_fp16.c
index a47f3de15..d5fab13f7 100644
--- a/source/device/cpu/op/relu/relu_kernel_ref_fp16.c
+++ b/source/device/cpu/op/relu/relu_kernel_ref_fp16.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 #if MACOS
 #else
 int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope)
@@ -50,7 +49,7 @@ int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, flo
     fp16_t* output_fp16 = (fp16_t*)output_tensor->data;
     float* input_fp32 = (float*)sys_malloc(total_size * sizeof(float));
 
-    for(int i=0; i< total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         input_fp32[i] = fp16_to_fp32(input_fp16[i]);
     }
@@ -78,7 +77,7 @@ int ref_relu_fp16(struct tensor* input_tensor, struct tensor* output_tensor, flo
     }
 
     /* cost fp32 to fp16 */
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         output_fp16[i] = fp32_to_fp16(input_fp32[i]);
     }
diff --git a/source/device/cpu/op/relu/relu_kernel_ref_fp32.c b/source/device/cpu/op/relu/relu_kernel_ref_fp32.c
index 7130f3450..49fb8b8d7 100644
--- a/source/device/cpu/op/relu/relu_kernel_ref_fp32.c
+++ b/source/device/cpu/op/relu/relu_kernel_ref_fp32.c
@@ -36,7 +36,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_relu_fp32(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope)
 {
     int total_size = input_tensor->elem_num;
diff --git a/source/device/cpu/op/relu/relu_kernel_ref_int8.c b/source/device/cpu/op/relu/relu_kernel_ref_int8.c
index a43f844f8..885444b29 100644
--- a/source/device/cpu/op/relu/relu_kernel_ref_int8.c
+++ b/source/device/cpu/op/relu/relu_kernel_ref_int8.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_relu_int8(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope)
 {
     int total_size = input_tensor->elem_num;
@@ -51,9 +50,9 @@ int ref_relu_int8(struct tensor* input_tensor, struct tensor* output_tensor, flo
 
     float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float));
 
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
-        data_fp32[i] = (float )input_int8[i] * input_scale;
+        data_fp32[i] = (float)input_int8[i] * input_scale;
     }
 
     /* process */
@@ -79,7 +78,7 @@ int ref_relu_int8(struct tensor* input_tensor, struct tensor* output_tensor, flo
     }
 
     /* quant */
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         int data_i32 = round(data_fp32[i] / output_scale);
         if (data_i32 > 127)
diff --git a/source/device/cpu/op/relu/relu_kernel_ref_uint8.c b/source/device/cpu/op/relu/relu_kernel_ref_uint8.c
index f687332ff..1b64308cd 100644
--- a/source/device/cpu/op/relu/relu_kernel_ref_uint8.c
+++ b/source/device/cpu/op/relu/relu_kernel_ref_uint8.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_relu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, float negative_slope)
 {
     int total_size = input_tensor->elem_num;
@@ -53,9 +52,9 @@ int ref_relu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, fl
 
     float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float));
 
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
-        data_fp32[i] = ((float )input_uint8[i] - (float )input_zero) * input_scale;
+        data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     /* process */
@@ -81,7 +80,7 @@ int ref_relu_uint8(struct tensor* input_tensor, struct tensor* output_tensor, fl
     }
 
     /* quant */
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         int udata = round(data_fp32[i] / output_scale + output_zero);
         if (udata > 255)
diff --git a/source/device/cpu/op/relu/relu_ref.c b/source/device/cpu/op/relu/relu_ref.c
index b4a1e66f3..2b0372686 100644
--- a/source/device/cpu/op/relu/relu_ref.c
+++ b/source/device/cpu/op/relu/relu_ref.c
@@ -38,7 +38,6 @@
 
 #include "relu_kernel_ref.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -56,17 +55,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct relu_param* relu_param = ( struct relu_param* )ir_node->op.param_mem;
+    struct relu_param* relu_param = (struct relu_param*)ir_node->op.param_mem;
 
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_relu_fp32(input_tensor, output_tensor, relu_param->negative_slope);
     else if (input_tensor->data_type == TENGINE_DT_FP16)
-        #if MACOS
+#if MACOS
         TLOG_ERR("FP16 not support mac os");
-        #else
+#else
         ret = ref_relu_fp16(input_tensor, output_tensor, relu_param->negative_slope);
-        #endif
+#endif
     else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_relu_uint8(input_tensor, output_tensor, relu_param->negative_slope);
     else if (input_tensor->data_type == TENGINE_DT_INT8)
diff --git a/source/device/cpu/op/relu1/relu1_ref.c b/source/device/cpu/op/relu1/relu1_ref.c
index bee4fd347..337bc5812 100644
--- a/source/device/cpu/op/relu1/relu1_ref.c
+++ b/source/device/cpu/op/relu1/relu1_ref.c
@@ -32,7 +32,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_relu1_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     int w = input_tensor->dims[3];
diff --git a/source/device/cpu/op/relu6/relu6_ref.c b/source/device/cpu/op/relu6/relu6_ref.c
index 834565e2a..98bfa2006 100644
--- a/source/device/cpu/op/relu6/relu6_ref.c
+++ b/source/device/cpu/op/relu6/relu6_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     int w = input_tensor->dims[3];
@@ -57,12 +56,12 @@ int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, i
 
     float* data_fp32 = (float*)sys_malloc(total_size * sizeof(float));
 
-    for(int i = 0; i < total_size; i++)
-        data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale;
+    for (int i = 0; i < total_size; i++)
+        data_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
 
     for (int n = 0; n < batch; n++)
     {
-//#pragma omp parallel for num_threads(num_thread)
+        //#pragma omp parallel for num_threads(num_thread)
         for (int q = 0; q < channels; q++)
         {
             float* src = data_fp32 + batch_step * n + c_step * q;
@@ -73,14 +72,14 @@ int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, i
                 dst[i] = src[i];
                 if (src[i] > 6)
                     dst[i] = 6;
-                else if(src[i] < 0)
+                else if (src[i] < 0)
                     dst[i] = 0;
             }
         }
     }
 
     // quant
-    for(int i=0; i<total_size; i++)
+    for (int i = 0; i < total_size; i++)
     {
         int udata = round(data_fp32[i] / output_scale + output_zero);
         if (udata > 255)
@@ -93,7 +92,6 @@ int ref_relu6_uint8(struct tensor* input_tensor, struct tensor* output_tensor, i
     return 0;
 }
 
-
 int ref_relu6_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     int w = input_tensor->dims[3];
@@ -144,10 +142,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-	int ret = -1;
-    if(input_tensor->data_type == TENGINE_DT_FP32)
+    int ret = -1;
+    if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_relu6_fp32(input_tensor, output_tensor, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_relu6_uint8(input_tensor, output_tensor, exec_graph->num_thread);
 
     return ret;
diff --git a/source/device/cpu/op/reorg/reorg_ref.c b/source/device/cpu/op/reorg/reorg_ref.c
index 84a976afd..3cff628a0 100644
--- a/source/device/cpu/op/reorg/reorg_ref.c
+++ b/source/device/cpu/op/reorg/reorg_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int ref_reorg_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct reorg_param* param,
                           int num_thread)
 {
@@ -98,7 +97,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct reorg_param* reorg_param = ( struct reorg_param* )ir_node->op.param_mem;
+    struct reorg_param* reorg_param = (struct reorg_param*)ir_node->op.param_mem;
 
     int ret = ref_reorg_fp32(input_tensor, output_tensor, reorg_param, exec_graph->num_thread);
     if (ret != 0)
diff --git a/source/device/cpu/op/reshape/reshape_ref.c b/source/device/cpu/op/reshape/reshape_ref.c
index a0e59b6fb..09ddd5f5b 100644
--- a/source/device/cpu/op/reshape/reshape_ref.c
+++ b/source/device/cpu/op/reshape/reshape_ref.c
@@ -35,7 +35,6 @@
 #include <math.h>
 #include <string.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -62,23 +61,26 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     switch (input_tensor->data_type)
     {
-        case TENGINE_DT_FP32:
-        case TENGINE_DT_INT32: {
-            size *= 4;
-            break;
-        }
-        case TENGINE_DT_FP16:
-        case TENGINE_DT_INT16: {
-            size *= 2;
-            break;
-        }
-        case TENGINE_DT_UINT8:
-        case TENGINE_DT_INT8: {
-            size *= 1;
-            break;
-        }
-        default:
-            return -1;
+    case TENGINE_DT_FP32:
+    case TENGINE_DT_INT32:
+    {
+        size *= 4;
+        break;
+    }
+    case TENGINE_DT_FP16:
+    case TENGINE_DT_INT16:
+    {
+        size *= 2;
+        break;
+    }
+    case TENGINE_DT_UINT8:
+    case TENGINE_DT_INT8:
+    {
+        size *= 1;
+        break;
+    }
+    default:
+        return -1;
     }
 
     if (size <= 0)
@@ -88,7 +90,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     /* transpose nchw to nhwc */
     //check dim size first???
-    if(input_tensor->dim_num == 4 && (output_tensor->dim_num == 2||output_tensor->dim_num == 3||output_tensor->dim_num == 4))
+    if (input_tensor->dim_num == 4 && (output_tensor->dim_num == 2 || output_tensor->dim_num == 3 || output_tensor->dim_num == 4))
     {
         if (ir_graph->model_layout == TENGINE_LAYOUT_NHWC)
         {
@@ -106,7 +108,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
                     float* input_fp32 = (float*)input_tensor->data;
                     float* output_fp32 = (float*)output_tensor->data;
-                    float* data_fp32_temp = ( float* )malloc(size);
+                    float* data_fp32_temp = (float*)malloc(size);
 
                     int index = 0;
                     for (int h = 0; h < in_h; h++)
@@ -138,7 +140,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
                     float* input_fp32 = (float*)input_tensor->data;
                     float* output_fp32 = (float*)output_tensor->data;
-                    float* data_fp32_temp = ( float* )malloc(size);
+                    float* data_fp32_temp = (float*)malloc(size);
 
                     int index = 0;
                     for (int h = 0; h < in_h; h++)
@@ -196,7 +198,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
                     uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
                     uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
-                    uint8_t* data_uint8_temp = ( uint8_t* )malloc(size);
+                    uint8_t* data_uint8_temp = (uint8_t*)malloc(size);
 
                     int index = 0;
                     for (int h = 0; h < in_h; h++)
@@ -228,7 +230,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
                     uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
                     uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
-                    uint8_t* data_uint8_temp = ( uint8_t* )malloc(size);
+                    uint8_t* data_uint8_temp = (uint8_t*)malloc(size);
 
                     int index = 0;
                     for (int h = 0; h < in_h; h++)
@@ -263,7 +265,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
                     int8_t* input_int8 = (int8_t*)input_tensor->data;
                     int8_t* output_int8 = (int8_t*)output_tensor->data;
-                    int8_t* data_int8_temp = ( int8_t* )malloc(size);
+                    int8_t* data_int8_temp = (int8_t*)malloc(size);
 
                     int index = 0;
                     for (int h = 0; h < in_h; h++)
@@ -295,7 +297,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
                     int8_t* input_int8 = (int8_t*)input_tensor->data;
                     int8_t* output_int8 = (int8_t*)output_tensor->data;
-                    int8_t* data_int8_temp = ( int8_t* )malloc(size);
+                    int8_t* data_int8_temp = (int8_t*)malloc(size);
 
                     int index = 0;
                     for (int h = 0; h < in_h; h++)
diff --git a/source/device/cpu/op/resize/resize_ref.c b/source/device/cpu/op/resize/resize_ref.c
index 6b2a38a5d..c787f3ec6 100644
--- a/source/device/cpu/op/resize/resize_ref.c
+++ b/source/device/cpu/op/resize/resize_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 #define T_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define T_MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -75,8 +74,7 @@ static void bilinear_resize(float* inp, float* output, int h, int w, int c, floa
             for (int k = 0; k < c; k++)
             {
                 int in_index = in_idx + k * in_hw;
-                output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy +
-                                               inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy;
+                output[k * out_hw + out_idx] = inp[in_index] * fx_0 * fy_0 + inp[in_index + w] * fx_0 * fy + inp[in_index + 1] * fx * fy_0 + inp[in_index + w + 1] * fx * fy;
             }
         }
     }
@@ -94,10 +92,10 @@ static void nearest_neighbor_resize(float* inp, float* out, int h, int w, int c_
         output = out + k * oh * ow;
         for (int i = 0; i < oh; i++)
         {
-            si = T_MIN(( int )(i * scale_y), h - 1);
+            si = T_MIN((int)(i * scale_y), h - 1);
             for (int j = 0; j < ow; j++)
             {
-                sj = T_MIN(( int )(j * scale_x), w - 1);
+                sj = T_MIN((int)(j * scale_x), w - 1);
                 output[i * ow + j] = input[si * w + sj];
             }
         }
@@ -128,14 +126,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct resize_param* resize_param = ( struct resize_param* )ir_node->op.param_mem;
+    struct resize_param* resize_param = (struct resize_param*)ir_node->op.param_mem;
 
     float scale_x = 1.f / resize_param->scale_w;
     float scale_y = 1.f / resize_param->scale_h;
     int in_chw = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3];
     int out_chw = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3];
-    float* input = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
+    float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
 
     if (resize_param->type == 0)
     {
diff --git a/source/device/cpu/op/reverse/reverse_ref.c b/source/device/cpu/op/reverse/reverse_ref.c
index 66b692614..07d4a6a9c 100644
--- a/source/device/cpu/op/reverse/reverse_ref.c
+++ b/source/device/cpu/op/reverse/reverse_ref.c
@@ -34,10 +34,9 @@
 
 #include <math.h>
 
-
 struct reverse_param
 {
-    int in_shape[4];    // the dim of the input
+    int in_shape[4]; // the dim of the input
     int dim_size;
 };
 
@@ -64,8 +63,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x];
                         }
                     }
                 }
@@ -82,8 +80,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x];
                         }
                     }
                 }
@@ -100,8 +97,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x];
                         }
                     }
                 }
@@ -118,8 +114,7 @@ int ref_reverse_fp32(void* input, void* input_axis, void* output, const struct r
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)];
                         }
                     }
                 }
@@ -157,8 +152,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[(param->in_shape[0] - 1 - i) * in_chw + j * in_hw + y * in_w + x];
                         }
                     }
                 }
@@ -175,8 +169,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + (param->in_shape[1] - 1 - j) * in_hw + y * in_w + x];
                         }
                     }
                 }
@@ -193,8 +186,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + (param->in_shape[2] - 1 - y) * in_w + x];
                         }
                     }
                 }
@@ -211,8 +203,7 @@ int ref_reverse_uint8(void* input, void* input_axis, void* output, const struct
                     {
                         for (int x = 0; x < param->in_shape[3]; x++)
                         {
-                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] =
-                                in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)];
+                            out_ptr[i * in_chw + j * in_hw + y * in_w + x] = in_ptr[i * in_chw + j * in_hw + y * in_w + (param->in_shape[3] - 1 - x)];
                         }
                     }
                 }
@@ -267,9 +258,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_reverse_fp32(input_tensor->data, axis_tensor->data, output_tensor->data, &reverse_param,
                                exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_reverse_uint8(input_tensor->data, axis_tensor->data, output_tensor->data, &reverse_param,
-                               exec_graph->num_thread);
+                                exec_graph->num_thread);
 
     return ret;
 }
diff --git a/source/device/cpu/op/rnn/rnn_ref.c b/source/device/cpu/op/rnn/rnn_ref.c
index 37b258689..ee60e4247 100644
--- a/source/device/cpu/op/rnn/rnn_ref.c
+++ b/source/device/cpu/op/rnn/rnn_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 struct rnn_ref_param
 {
     float* init_h_data;
@@ -87,10 +86,10 @@ static int do_RNN_step(const float* input, float* init_h, const float* kernel, c
     int input_total_size = input_size + hidden_size;
     int batch_cell_size = hidden_size * batch_size;
 
-    float* ig = ( float* )malloc(batch_cell_size * sizeof(float));
+    float* ig = (float*)malloc(batch_cell_size * sizeof(float));
 
-    float* merged_input = ( float* )malloc(sizeof(float) * batch_size * (input_total_size));
-    float* matmul_result = ( float* )malloc(sizeof(float) * batch_size * hidden_size);
+    float* merged_input = (float*)malloc(sizeof(float) * batch_size * (input_total_size));
+    float* matmul_result = (float*)malloc(sizeof(float) * batch_size * hidden_size);
 
     // merge input
     concat_axis_1_rnn(input, init_h, merged_input, batch_size, input_size, hidden_size);
@@ -123,7 +122,7 @@ static int do_RNN_step(const float* input, float* init_h, const float* kernel, c
 
 static int ref_rnn_fp32(float* input, float* output, struct rnn_ref_param* param)
 {
-    float* init_h = ( float* )malloc((unsigned long )param->batch_size * param->hidden_size * sizeof(float));
+    float* init_h = (float*)malloc((unsigned long)param->batch_size * param->hidden_size * sizeof(float));
     if (param->init_h_data)
     {
         for (int i = 0; i < param->batch_size; i++)
@@ -133,7 +132,7 @@ static int ref_rnn_fp32(float* input, float* output, struct rnn_ref_param* param
     }
     else
     {
-        memset(init_h, 0x0, sizeof((unsigned long )param->batch_size * param->hidden_size * sizeof(float)));
+        memset(init_h, 0x0, sizeof((unsigned long)param->batch_size * param->hidden_size * sizeof(float)));
     }
 
     int ret = 0;
@@ -151,7 +150,7 @@ static int ref_rnn_fp32(float* input, float* output, struct rnn_ref_param* param
         // final_state [batch_size,hidden_size]
         if (i + param->output_len >= param->seq_lens)
         {
-            memcpy(output, init_h, (unsigned long )param->batch_size * param->hidden_size * sizeof(float));
+            memcpy(output, init_h, (unsigned long)param->batch_size * param->hidden_size * sizeof(float));
             output += param->batch_size * param->hidden_size;
         }
     }
@@ -180,7 +179,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct tensor* output_tensor;
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
     int in_num = ir_node->input_num;
-    struct rnn_param* rnn_param = ( struct rnn_param* )ir_node->op.param_mem;
+    struct rnn_param* rnn_param = (struct rnn_param*)ir_node->op.param_mem;
     struct tensor* init_h_tensor;
 
     for (int count = 0; count < in_num; count++)
@@ -215,7 +214,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct rnn_param* rnn_param = ( struct rnn_param* )ir_node->op.param_mem;
+    struct rnn_param* rnn_param = (struct rnn_param*)ir_node->op.param_mem;
 
     int input_size = rnn_param->input_size;
     int hidden_size = rnn_param->hidden_size;
@@ -227,7 +226,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int batch_size = input_tensor->dims[1];
     int output_len = rnn_param->output_len;
 
-    float* init_h = ( float* )malloc((size_t)batch_size * hidden_size * sizeof(float));
+    float* init_h = (float*)malloc((size_t)batch_size * hidden_size * sizeof(float));
     if (init_h == NULL)
     {
         return -1;
diff --git a/source/device/cpu/op/roialign/roialign_ref.c b/source/device/cpu/op/roialign/roialign_ref.c
index 599998f0a..61de55300 100644
--- a/source/device/cpu/op/roialign/roialign_ref.c
+++ b/source/device/cpu/op/roialign/roialign_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 #define T_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define T_MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -95,8 +94,8 @@ static int ref_roialign_fp32(struct tensor* input_tensor, struct tensor* roi_ten
     float roi_w = T_MAX(roi_x2 - roi_x1, 1);
     float roi_h = T_MAX(roi_y2 - roi_y1, 1);
 
-    float bin_size_w = roi_w / ( float )w;
-    float bin_size_h = roi_h / ( float )h;
+    float bin_size_w = roi_w / (float)w;
+    float bin_size_h = roi_h / (float)h;
 
     int channel = input_tensor->dims[1];
     int in_height = input_tensor->dims[2];
@@ -123,10 +122,10 @@ static int ref_roialign_fp32(struct tensor* input_tensor, struct tensor* roi_ten
                 float hend = roi_y1 + (ph + 1) * bin_size_h;
                 float wend = roi_x1 + (pw + 1) * bin_size_w;
 
-                hstart = T_MIN(T_MAX(hstart, 0.f), ( float )in_height);
-                wstart = T_MIN(T_MAX(wstart, 0.f), ( float )in_width);
-                hend = T_MIN(T_MAX(hend, 0.f), ( float )in_height);
-                wend = T_MIN(T_MAX(wend, 0.f), ( float )in_width);
+                hstart = T_MIN(T_MAX(hstart, 0.f), (float)in_height);
+                wstart = T_MIN(T_MAX(wstart, 0.f), (float)in_width);
+                hend = T_MIN(T_MAX(hend, 0.f), (float)in_height);
+                wend = T_MIN(T_MAX(wend, 0.f), (float)in_width);
 
                 int bin_grid_h = ceil(hend - hstart);
                 int bin_grid_w = ceil(wend - wstart);
@@ -137,18 +136,18 @@ static int ref_roialign_fp32(struct tensor* input_tensor, struct tensor* roi_ten
                 float sum = 0.f;
                 for (int by = 0; by < bin_grid_h; by++)
                 {
-                    float y = hstart + (by + 0.5f) * bin_size_h / ( float )bin_grid_h;
+                    float y = hstart + (by + 0.5f) * bin_size_h / (float)bin_grid_h;
 
                     for (int bx = 0; bx < bin_grid_w; bx++)
                     {
-                        float x = wstart + (bx + 0.5f) * bin_size_w / ( float )bin_grid_w;
+                        float x = wstart + (bx + 0.5f) * bin_size_w / (float)bin_grid_w;
 
                         // bilinear interpolate at (x,y)
                         float v = bilinear_interpolate(ptr, in_width, in_height, x, y);
                         sum += v;
                     }
                 }
-                outptr[pw] = is_empty ? 0.f : (sum / ( float )area);
+                outptr[pw] = is_empty ? 0.f : (sum / (float)area);
             }
             outptr += w;
         }
@@ -178,7 +177,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     roi_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct roialign_param* roialign_param = ( struct roialign_param* )ir_node->op.param_mem;
+    struct roialign_param* roialign_param = (struct roialign_param*)ir_node->op.param_mem;
 
     ref_roialign_fp32(input_tensor, roi_tensor, output_tensor, roialign_param, exec_graph->num_thread);
 
diff --git a/source/device/cpu/op/roipooling/roipooling_ref.c b/source/device/cpu/op/roipooling/roipooling_ref.c
index 3a59ca997..cf554bbec 100644
--- a/source/device/cpu/op/roipooling/roipooling_ref.c
+++ b/source/device/cpu/op/roipooling/roipooling_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -72,8 +71,8 @@ static int ref_roipooling_fp32(struct tensor* input_tensor, struct tensor* roi_t
         int roi_w = MAX(roi_x1 - roi_x0 + 1, 1);
         int roi_h = MAX(roi_y1 - roi_y0 + 1, 1);
 
-        float bin_w = ( float )roi_w / ( float )out_w;
-        float bin_h = ( float )roi_h / ( float )out_h;
+        float bin_w = (float)roi_w / (float)out_w;
+        float bin_h = (float)roi_h / (float)out_h;
 
         for (int c = 0; c < channel; ++c)
         {
@@ -83,10 +82,10 @@ static int ref_roipooling_fp32(struct tensor* input_tensor, struct tensor* roi_t
             {
                 for (int w = 0; w < out_w; ++w)
                 {
-                    int h0 = roi_y0 + ( int )floor((double)( h )*bin_h);
-                    int h1 = roi_y0 + ( int )ceil((double)(h + 1) * bin_h);
-                    int w0 = roi_x0 + ( int )floor((double)( w )*bin_w);
-                    int w1 = roi_x0 + ( int )ceil((double)(w + 1) * bin_w);
+                    int h0 = roi_y0 + (int)floor((double)(h)*bin_h);
+                    int h1 = roi_y0 + (int)ceil((double)(h + 1) * bin_h);
+                    int w0 = roi_x0 + (int)floor((double)(w)*bin_w);
+                    int w1 = roi_x0 + (int)ceil((double)(w + 1) * bin_w);
 
                     h0 = MIN(MAX(h0, 0), in_h);
                     h1 = MIN(MAX(h1, 0), in_h);
@@ -134,7 +133,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     roi_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct roipooling_param* roipooling_param = ( struct roipooling_param* )ir_node->op.param_mem;
+    struct roipooling_param* roipooling_param = (struct roipooling_param*)ir_node->op.param_mem;
 
     // set output dims
     int dims[4];
@@ -156,7 +155,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct roipooling_param* roipooling_param = ( struct roipooling_param* )node->op.param_mem;
+    struct roipooling_param* roipooling_param = (struct roipooling_param*)node->op.param_mem;
 
     int dims[4];
 
diff --git a/source/device/cpu/op/round/round_ref.c b/source/device/cpu/op/round/round_ref.c
index 1524fa1a0..ca76ee7d6 100644
--- a/source/device/cpu/op/round/round_ref.c
+++ b/source/device/cpu/op/round/round_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 int ref_round_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     // dims size = 2 or 3
diff --git a/source/device/cpu/op/rpn/rpn_ref.c b/source/device/cpu/op/rpn/rpn_ref.c
index a9d20813d..6112e332b 100644
--- a/source/device/cpu/op/rpn/rpn_ref.c
+++ b/source/device/cpu/op/rpn/rpn_ref.c
@@ -38,20 +38,19 @@
 #include <math.h>
 #include <string.h>
 
-
 struct anchor_box
 {
-    float x0;    // xmin
-    float y0;    // ymin
-    float x1;    // xmax
-    float y1;    // ymax
+    float x0; // xmin
+    float y0; // ymin
+    float x1; // xmax
+    float y1; // ymax
 };
 struct RPN_Box
 {
-    float x0;    // xmin
-    float y0;    // ymin
-    float x1;    // xmax
-    float y1;    // ymax
+    float x0; // xmin
+    float y0; // ymin
+    float x1; // xmax
+    float y1; // ymax
     float score;
 };
 
@@ -174,9 +173,9 @@ void nms_rpn_boxes(struct RPN_Box* input_boxes, int* size, float nms_thresh)
     int input_size = *size;
     int output_size = 0;
 
-    struct RPN_Box* output_boxes = ( struct RPN_Box* )sys_malloc(sizeof(struct RPN_Box) * input_size);
-    float* areas = ( float* )sys_malloc(sizeof(float) * input_size);
-    int* picked = ( int* )sys_malloc(sizeof(int) * input_size);
+    struct RPN_Box* output_boxes = (struct RPN_Box*)sys_malloc(sizeof(struct RPN_Box) * input_size);
+    float* areas = (float*)sys_malloc(sizeof(float) * input_size);
+    int* picked = (int*)sys_malloc(sizeof(int) * input_size);
 
     for (int i = 0; i < input_size; ++i)
     {
@@ -220,13 +219,13 @@ void ref_proposal_local_anchor(int feat_height, int feat_width, int feat_stride,
                                float* local_anchors)
 {
     int feat_size = feat_height * feat_width;
-    int num_anchors = ( int )anchors->elem_num;
+    int num_anchors = (int)anchors->elem_num;
     for (int i = 0; i < num_anchors; ++i)
     {
         for (int j = 0; j < feat_height; j++)
             for (int k = 0; k < feat_width; k++)
             {
-                Anchor_t anchor_val = *( Anchor_t* )(get_vector_data(anchors, i));
+                Anchor_t anchor_val = *(Anchor_t*)(get_vector_data(anchors, i));
                 local_anchors[(i * 4 + 0) * feat_size + j * feat_width + k] = anchor_val.x0 + k * feat_stride;
                 local_anchors[(i * 4 + 1) * feat_size + j * feat_width + k] = anchor_val.y0 + j * feat_stride;
                 local_anchors[(i * 4 + 2) * feat_size + j * feat_width + k] = anchor_val.x1 + k * feat_stride;
@@ -242,7 +241,7 @@ int ref_rpn_fp32(const float* score, float* featmap, float* anchors, float* outp
     int featmap_size = param->feat_height * param->feat_width * param->feat_chan;
     int max_num_boxes = featmap_size / 4;
 
-    struct RPN_Box* boxes = ( struct RPN_Box* )sys_malloc(max_num_boxes * sizeof(struct RPN_Box));
+    struct RPN_Box* boxes = (struct RPN_Box*)sys_malloc(max_num_boxes * sizeof(struct RPN_Box));
 
     bbox_tranform_inv(featmap, anchors, param);
 
@@ -301,7 +300,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
-    rpn_param_t* _param = ( struct rpn_param* )(ir_node->op.param_mem);
+    rpn_param_t* _param = (struct rpn_param*)(ir_node->op.param_mem);
     struct graph* ir_graph = ir_node->graph;
     struct tensor* score_tensor;
     struct tensor* featmap_tensor;
@@ -315,11 +314,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     const void* score_org = score_tensor->data;
     void* featmap_org = featmap_tensor->data;
-    const float* info_org = ( float* )info_tensor->data;
+    const float* info_org = (float*)info_tensor->data;
     void* output_org = output_tensor->data;
 
     struct rpn_param_ref param;
-    param.num_anchors = ( int )_param->anchors_->elem_num;
+    param.num_anchors = (int)_param->anchors_->elem_num;
     param.feat_chan = featmap_tensor->dims[1];
     param.feat_height = featmap_tensor->dims[2];
     param.feat_width = featmap_tensor->dims[3];
@@ -334,7 +333,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     param.min_size = _param->min_size;
     param.feat_stride = _param->feat_stride;
     int size = param.num_anchors * 4 * feat_size;
-    float* local_anchors = ( float* )sys_malloc(size * sizeof(float));
+    float* local_anchors = (float*)sys_malloc(size * sizeof(float));
 
     ref_proposal_local_anchor(featmap_tensor->dims[2], featmap_tensor->dims[3], _param->feat_stride, _param->anchors_,
                               local_anchors);
diff --git a/source/device/cpu/op/scale/scale_ref.c b/source/device/cpu/op/scale/scale_ref.c
index 7a1c30d51..426fcd2c8 100644
--- a/source/device/cpu/op/scale/scale_ref.c
+++ b/source/device/cpu/op/scale/scale_ref.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_scale_fp32(struct tensor* input_tensor, struct tensor* gamma_tensor, struct tensor* beta_tensor,
                    struct tensor* output_tensor, struct scale_param* param, int num_thread)
 {
@@ -110,7 +109,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         beta_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
 
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct scale_param* scale_param = ( struct scale_param* )ir_node->op.param_mem;
+    struct scale_param* scale_param = (struct scale_param*)ir_node->op.param_mem;
 
     ref_scale_fp32(input_tensor, gamma_tensor, beta_tensor, output_tensor, scale_param, exec_graph->num_thread);
 
diff --git a/source/device/cpu/op/scatter/scatter_ref.c b/source/device/cpu/op/scatter/scatter_ref.c
index ca5b8b598..cb0e2ed69 100644
--- a/source/device/cpu/op/scatter/scatter_ref.c
+++ b/source/device/cpu/op/scatter/scatter_ref.c
@@ -38,7 +38,6 @@
 #include <stdbool.h>
 #include <string.h>
 
-
 struct ref_scatter_param
 {
     int axis;
@@ -51,11 +50,9 @@ struct ref_scatter_param
     int indiceSize;
 };
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ref_scatter_param* scatter_op_param =
-        (struct ref_scatter_param*)sys_malloc(sizeof(struct ref_scatter_param));
+    struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)sys_malloc(sizeof(struct ref_scatter_param));
     memset(scatter_op_param, 0, sizeof(struct ref_scatter_param));
     exec_node->ops_priv = scatter_op_param;
     return 0;
@@ -72,47 +69,52 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    struct ref_scatter_param* scatter_op_param = ( struct ref_scatter_param* )exec_node->ops_priv;
+    struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)exec_node->ops_priv;
     struct scatter_param* param = (struct scatter_param*)(ir_node->op.param_mem);
     scatter_op_param->dim_size = input_tensor->dim_num;
     scatter_op_param->is_onnx = param->is_onnx;
-    for(int i = 0; i < 4; i++){
+    for (int i = 0; i < 4; i++)
+    {
         scatter_op_param->dims[i] = 1;
     }
-    
-    if(scatter_op_param->is_onnx){
+
+    if (scatter_op_param->is_onnx)
+    {
         struct tensor* indices_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
         int indicesDimsSize = indices_tensor->dim_num;
-        scatter_op_param->indice_dim = (int*)malloc(sizeof(int)*indicesDimsSize);
+        scatter_op_param->indice_dim = (int*)malloc(sizeof(int) * indicesDimsSize);
         scatter_op_param->indiceSize = indicesDimsSize;
-    
+
         struct tensor* updates_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
         int updatesDimsSize = updates_tensor->dim_num;
-        scatter_op_param->update_dim = (int*)malloc(sizeof(int)*updatesDimsSize);
+        scatter_op_param->update_dim = (int*)malloc(sizeof(int) * updatesDimsSize);
         scatter_op_param->updateSize = updatesDimsSize;
     }
 
     return 0;
 }
 
-static int ref_scatter_fp32(float* input, float* output, int* indices, float* updates, struct ref_scatter_param* op_param){
+static int ref_scatter_fp32(float* input, float* output, int* indices, float* updates, struct ref_scatter_param* op_param)
+{
     int axis = op_param->axis;
     bool is_onnx = op_param->is_onnx;
     TLOG_ERR("indices %f %f \n", updates[0], updates[1]);
     TLOG_ERR("indices %d %d \n", indices[0], indices[1]);
     int outSize = 1;
-    for(int i = 0; i < op_param->dim_size; i++){
-        outSize *= op_param->dims[4-op_param->dim_size+i];
+    for (int i = 0; i < op_param->dim_size; i++)
+    {
+        outSize *= op_param->dims[4 - op_param->dim_size + i];
     }
-    memcpy(output, input, sizeof(float)*outSize);
+    memcpy(output, input, sizeof(float) * outSize);
 
     int calIndexDims[4];
     int realIndexDims[4];
-    int outCalAxis[4] ;
+    int outCalAxis[4];
     int outRealAxis[4];
     int updateDims[4];
 
-    for(int i = 0; i< 4; i++){
+    for (int i = 0; i < 4; i++)
+    {
         calIndexDims[i] = 0;
         realIndexDims[i] = 1;
         outCalAxis[i] = 0;
@@ -122,16 +124,18 @@ static int ref_scatter_fp32(float* input, float* output, int* indices, float* up
 
     int diff = 4 - op_param->updateSize;
     //TLOG_ERR("update size: %d \n", op_param->updateSize);
-    for(int i=0; i < op_param->updateSize; i++){
+    for (int i = 0; i < op_param->updateSize; i++)
+    {
         calIndexDims[diff + i] = op_param->update_dim[i];
         realIndexDims[diff + i] = op_param->update_dim[i];
-        TLOG_ERR("%d %d \n",calIndexDims[diff + i], realIndexDims[diff + i]);
+        TLOG_ERR("%d %d \n", calIndexDims[diff + i], realIndexDims[diff + i]);
     }
 
     diff = 4 - op_param->dim_size;
-    for(int i = 0; i < op_param->dim_size; i++){
+    for (int i = 0; i < op_param->dim_size; i++)
+    {
         outCalAxis[diff + i] = 1;
-        outRealAxis[diff+i] = op_param->dims[diff+i];
+        outRealAxis[diff + i] = op_param->dims[diff + i];
     }
     outCalAxis[diff + op_param->axis] = 2;
 
@@ -142,123 +146,151 @@ static int ref_scatter_fp32(float* input, float* output, int* indices, float* up
     TLOG_ERR("Ready for test\n");
     // TLOG_ERR("reaslIndexDims: %d %d %d %d \n", realIndexDims[0] ,realIndexDims[1], realIndexDims[2],realIndexDims[3]);
     // op_param->axis = -1;
-    if(is_onnx){
-        if(op_param->axis != -1){
-            if(op_param->dim_size == 1){
+    if (is_onnx)
+    {
+        if (op_param->axis != -1)
+        {
+            if (op_param->dim_size == 1)
+            {
                 TLOG_ERR("dims 1\n");
-                for(int n = 0; n < realIndexDims[0]; n++){
-                    for(int c = 0; c < realIndexDims[1]; c++){
-                        for(int h = 0; h < realIndexDims[2]; h++){
-                            for(int w = 0; w < realIndexDims[3]; w++){
-                                
-                                int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w;
+                for (int n = 0; n < realIndexDims[0]; n++)
+                {
+                    for (int c = 0; c < realIndexDims[1]; c++)
+                    {
+                        for (int h = 0; h < realIndexDims[2]; h++)
+                        {
+                            for (int w = 0; w < realIndexDims[3]; w++)
+                            {
+                                int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w;
                                 int index = indices[ii];
-                                if(index < 0){
+                                if (index < 0)
+                                {
                                     index = inW + index + 1;
                                 }
                                 float value = updates[ii];
 
                                 int outIndex = index;
                                 output[outIndex] = value;
-                        
-
                             }
                         }
                     }
                 }
-            } else if(op_param->dim_size == 2){
+            }
+            else if (op_param->dim_size == 2)
+            {
                 TLOG_ERR("dims 2 in \n");
-                for(int n = 0; n < realIndexDims[0]; n++){
-                    for(int c = 0; c < realIndexDims[1]; c++){
-                        for(int h = 0; h < realIndexDims[2]; h++){
-                            for(int w = 0; w < realIndexDims[3]; w++){
+                for (int n = 0; n < realIndexDims[0]; n++)
+                {
+                    for (int c = 0; c < realIndexDims[1]; c++)
+                    {
+                        for (int h = 0; h < realIndexDims[2]; h++)
+                        {
+                            for (int w = 0; w < realIndexDims[3]; w++)
+                            {
                                 TLOG_ERR("cadsfasd \n");
-                                int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w;
+                                int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w;
                                 TLOG_ERR("cadsfasd 2 %d \n", ii);
                                 int index = indices[ii];
                                 TLOG_ERR("cadsfasd 3\n");
                                 float value = updates[ii];
                                 TLOG_ERR("dims 2ddd\n");
-                                if(op_param->axis == 1){
+                                if (op_param->axis == 1)
+                                {
                                     index = index < 0 ? inW + index + 1 : index;
-                                    
-                                    int outIndex = h*realIndexDims[3] + index;
+
+                                    int outIndex = h * realIndexDims[3] + index;
                                     TLOG_ERR("%d %d \n", index, outIndex);
                                     output[outIndex] = value;
                                 }
-                                if(op_param->axis == 0){
-                                    index = index < 0 ? inH + index + 1: index;
-                                    
-                                    int outIndex = index*realIndexDims[3] + w;
+                                if (op_param->axis == 0)
+                                {
+                                    index = index < 0 ? inH + index + 1 : index;
+
+                                    int outIndex = index * realIndexDims[3] + w;
                                     TLOG_ERR("%d %d \n", index, outIndex);
                                     output[outIndex] = value;
                                 }
-
                             }
                         }
                     }
                 }
-            } else if(op_param->dim_size == 3) {
+            }
+            else if (op_param->dim_size == 3)
+            {
                 TLOG_ERR("dims 3\n");
-                for(int n = 0; n < realIndexDims[0]; n++){
-                    for(int c = 0; c < realIndexDims[1]; c++){
-                        for(int h = 0; h < realIndexDims[2]; h++){
-                            for(int w = 0; w < realIndexDims[3]; w++){
-                                
-                                int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w;
+                for (int n = 0; n < realIndexDims[0]; n++)
+                {
+                    for (int c = 0; c < realIndexDims[1]; c++)
+                    {
+                        for (int h = 0; h < realIndexDims[2]; h++)
+                        {
+                            for (int w = 0; w < realIndexDims[3]; w++)
+                            {
+                                int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w;
                                 int index = indices[ii];
                                 float value = updates[ii];
 
-                                if(op_param->axis == 1){
-                                    index = index < 0 ? inH + index + 1: index;
-                                    int outIndex = c*inH*inW + index*realIndexDims[3] + w;
+                                if (op_param->axis == 1)
+                                {
+                                    index = index < 0 ? inH + index + 1 : index;
+                                    int outIndex = c * inH * inW + index * realIndexDims[3] + w;
                                     output[outIndex] = value;
                                 }
-                                if(op_param->axis == 0){
-                                    index = index < 0 ? inC + index + 1: index;
+                                if (op_param->axis == 0)
+                                {
+                                    index = index < 0 ? inC + index + 1 : index;
                                     // TLOG_ERR("%d \n", index);
-                                    int outIndex = index*inH*inW + h*realIndexDims[3] + w;
+                                    int outIndex = index * inH * inW + h * realIndexDims[3] + w;
                                     output[outIndex] = value;
                                 }
-                                if(op_param->axis == 2){
-                                    index = index < 0 ? inW + index + 1: index;
-                                    int outIndex = c*inH*inW + h*realIndexDims[3] + index;
+                                if (op_param->axis == 2)
+                                {
+                                    index = index < 0 ? inW + index + 1 : index;
+                                    int outIndex = c * inH * inW + h * realIndexDims[3] + index;
                                     output[outIndex] = value;
                                 }
-
                             }
                         }
                     }
                 }
-            } else if(op_param->dim_size == 4){
+            }
+            else if (op_param->dim_size == 4)
+            {
                 TLOG_ERR("dims 4\n");
-                for(int n = 0; n < realIndexDims[0]; n++){
-                    for(int c = 0; c < realIndexDims[1]; c++){
-                        for(int h = 0; h < realIndexDims[2]; h++){
-                            for(int w = 0; w < realIndexDims[3]; w++){
-                                
-                                int ii = n*calIndexDims[1]*calIndexDims[2]*calIndexDims[3]+c*calIndexDims[2]*calIndexDims[3]+h*calIndexDims[3]+w;
+                for (int n = 0; n < realIndexDims[0]; n++)
+                {
+                    for (int c = 0; c < realIndexDims[1]; c++)
+                    {
+                        for (int h = 0; h < realIndexDims[2]; h++)
+                        {
+                            for (int w = 0; w < realIndexDims[3]; w++)
+                            {
+                                int ii = n * calIndexDims[1] * calIndexDims[2] * calIndexDims[3] + c * calIndexDims[2] * calIndexDims[3] + h * calIndexDims[3] + w;
                                 int index = indices[ii];
                                 float value = updates[ii];
 
-                                if(op_param->axis == 1){
-                                    index = index < 0 ? inC + index + 1: index;
-                                    int outIndex = n*inC*inH*inW + index*inH*inW + h*realIndexDims[3] + w;
+                                if (op_param->axis == 1)
+                                {
+                                    index = index < 0 ? inC + index + 1 : index;
+                                    int outIndex = n * inC * inH * inW + index * inH * inW + h * realIndexDims[3] + w;
                                     output[outIndex] = value;
                                 }
-                                if(op_param->axis == 0){
-                                    index = index < 0 ? inN + index + 1: index;
-                                    int outIndex = index*inC*inH*inW + c*inH*inW + h*realIndexDims[3] + w;
+                                if (op_param->axis == 0)
+                                {
+                                    index = index < 0 ? inN + index + 1 : index;
+                                    int outIndex = index * inC * inH * inW + c * inH * inW + h * realIndexDims[3] + w;
                                     output[outIndex] = value;
                                 }
-                                if(op_param->axis == 2){
-                                    index = index < 0 ? inH + index + 1: index;
-                                    int outIndex = n*inC*inH*inW + c*inH*inW + index*realIndexDims[3] + w;
+                                if (op_param->axis == 2)
+                                {
+                                    index = index < 0 ? inH + index + 1 : index;
+                                    int outIndex = n * inC * inH * inW + c * inH * inW + index * realIndexDims[3] + w;
                                     output[outIndex] = value;
                                 }
-                                if(op_param->axis == 3){
-                                    index = index < 0 ? inW + index + 1: index;
-                                    int outIndex = n*inC*inH*inW + c*inH*inW + h*realIndexDims[3] + index;
+                                if (op_param->axis == 3)
+                                {
+                                    index = index < 0 ? inW + index + 1 : index;
+                                    int outIndex = n * inC * inH * inW + c * inH * inW + h * realIndexDims[3] + index;
                                     output[outIndex] = value;
                                 }
                             }
@@ -266,38 +298,46 @@ static int ref_scatter_fp32(float* input, float* output, int* indices, float* up
                     }
                 }
             }
-        } else {
+        }
+        else
+        {
             int data_dims[4] = {1};
-            for(int i = 0; i < op_param->dim_size; i++){
+            for (int i = 0; i < op_param->dim_size; i++)
+            {
                 data_dims[3 - i] = op_param->dims[i];
             }
 
-            int iCHW = data_dims[1]* data_dims[2]* data_dims[3];
-            int iHW = data_dims[2]*data_dims[3];
-            
-            
-            for(int i = 0; i < op_param->updateSize; i++){
+            int iCHW = data_dims[1] * data_dims[2] * data_dims[3];
+            int iHW = data_dims[2] * data_dims[3];
+
+            for (int i = 0; i < op_param->updateSize; i++)
+            {
                 updateDims[4 - op_param->updateSize + i] = op_param->update_dim[i];
             }
 
-            int uCHW = updateDims[1]*updateDims[2]*updateDims[3];
-            int uHW = updateDims[2]*updateDims[3];
-            for(int n = 0; n < updateDims[0]; n++){
-                for(int c = 0; c < updateDims[1]; c++){
-                    for(int h = 0; h < updateDims[2]; h++){
-                        for(int w = 0; w < updateDims[3]; w++){
-                            int updateIndex = n*uCHW + c * uHW + h*updateDims[3] + w;
+            int uCHW = updateDims[1] * updateDims[2] * updateDims[3];
+            int uHW = updateDims[2] * updateDims[3];
+            for (int n = 0; n < updateDims[0]; n++)
+            {
+                for (int c = 0; c < updateDims[1]; c++)
+                {
+                    for (int h = 0; h < updateDims[2]; h++)
+                    {
+                        for (int w = 0; w < updateDims[3]; w++)
+                        {
+                            int updateIndex = n * uCHW + c * uHW + h * updateDims[3] + w;
                             int value = updates[updateIndex];
                             int index = indices[updateIndex];
-                            int outIndex = n*iCHW + c*iHW + w * updateDims[2] + index;
+                            int outIndex = n * iCHW + c * iHW + w * updateDims[2] + index;
                             output[outIndex] = value;
-
                         }
-                    }    
+                    }
                 }
             }
         }
-    } else {
+    }
+    else
+    {
         return -1;
     }
 
@@ -309,45 +349,51 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    struct ref_scatter_param* scatter_op_param = ( struct ref_scatter_param* )exec_node->ops_priv;
+    struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)exec_node->ops_priv;
     struct scatter_param* param = (struct scatter_param*)(ir_node->op.param_mem);
 
     int inputDimsSize = input_tensor->dim_num;
-    for(int i = 0; i < inputDimsSize; i++){
-        scatter_op_param->dims[4-inputDimsSize+i] = input_tensor->dims[i];
+    for (int i = 0; i < inputDimsSize; i++)
+    {
+        scatter_op_param->dims[4 - inputDimsSize + i] = input_tensor->dims[i];
     }
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
     scatter_op_param->axis = param->axis;
     scatter_op_param->is_onnx = param->is_onnx;
-    if(scatter_op_param->is_onnx){
+    if (scatter_op_param->is_onnx)
+    {
         struct tensor* indices_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
         int indicesDimsSize = indices_tensor->dim_num;
-        for(int i = 0; i < indicesDimsSize; i++){
+        for (int i = 0; i < indicesDimsSize; i++)
+        {
             scatter_op_param->indice_dim[i] = indices_tensor->dims[i];
         }
         struct tensor* updates_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
         int updatesDimsSize = updates_tensor->dim_num;
-        for(int i  = 0 ; i < updatesDimsSize; i++){
+        for (int i = 0; i < updatesDimsSize; i++)
+        {
             scatter_op_param->update_dim[i] = updates_tensor->dims[i];
         }
-        TLOG_ERR("Indecues %d \n",indicesDimsSize);
-        
-        int ret = ref_scatter_fp32((float*)input_tensor->data, (float*)output_tensor->data, 
-            (int*)indices_tensor->data, (float*)updates_tensor->data, scatter_op_param);
-        if(ret < 0){
+        TLOG_ERR("Indecues %d \n", indicesDimsSize);
+
+        int ret = ref_scatter_fp32((float*)input_tensor->data, (float*)output_tensor->data,
+                                   (int*)indices_tensor->data, (float*)updates_tensor->data, scatter_op_param);
+        if (ret < 0)
+        {
             TLOG_ERR("Scatter reference error \n");
         }
-    } else {
+    }
+    else
+    {
         return -1;
     }
 
-
     return 0;
 }
 
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ref_scatter_param* scatter_op_param = ( struct ref_scatter_param* )exec_node->ops_priv;
+    struct ref_scatter_param* scatter_op_param = (struct ref_scatter_param*)exec_node->ops_priv;
 
     sys_free(scatter_op_param->indice_dim);
     sys_free(scatter_op_param->update_dim);
diff --git a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c
index 6fbea330e..026625d71 100644
--- a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c
+++ b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -59,7 +58,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct selu_param* selu_param = ( struct selu_param* )ir_node->op.param_mem;
+    struct selu_param* selu_param = (struct selu_param*)ir_node->op.param_mem;
 
     int num_thread = exec_graph->num_thread;
 
diff --git a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c
index 28f68424c..4da7f06f8 100644
--- a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c
+++ b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.c
@@ -30,11 +30,10 @@
 
 #include <arm_neon.h>
 
-
 void selu_kernel(int i, int id, void* data, const float* input, float* output, float alpha, float lambda)
 {
     float alpha_lambda = alpha * lambda;
-    int step = (( int* )data)[0];
+    int step = ((int*)data)[0];
     float32x4_t _one = vdupq_n_f32(1.f);
     float32x4_t _zero = vdupq_n_f32(0.f);
     float32x4_t _alpha_lambda = vdupq_n_f32(alpha_lambda);
@@ -71,8 +70,8 @@ void selu_kernel(int i, int id, void* data, const float* input, float* output, f
 int selu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param,
              int num_thread)
 {
-    float* data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    float* data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
     float alpha = selu_param->alpha;
     float lambda = selu_param->lambda;
 
diff --git a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h
index 91220aa06..3ed2955cd 100644
--- a/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h
+++ b/source/device/cpu/op/selu/cortex-a/selu_kernel_arm.h
@@ -29,7 +29,6 @@
 
 #include "graph/tensor.h"
 
-
 int selu_run(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param, int num_thread);
 
 #endif
diff --git a/source/device/cpu/op/selu/selu_ref.c b/source/device/cpu/op/selu/selu_ref.c
index 789e1df1b..557f8105d 100644
--- a/source/device/cpu/op/selu/selu_ref.c
+++ b/source/device/cpu/op/selu/selu_ref.c
@@ -36,12 +36,11 @@
 
 #include <math.h>
 
-
 int ref_selu_fp32(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param,
                   int num_thread)
 {
-    float* data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    float* data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
     float alpha = selu_param->alpha;
     float lambda = selu_param->lambda;
     float alpha_lambda = alpha * lambda;
@@ -53,8 +52,8 @@ int ref_selu_fp32(struct tensor* output_tensor, struct tensor* input_tensor, str
     for (int i = 0; i < chan_num; i++)
     {
         int offset = i * chan_size;
-        float* input_data = ( float* )input_tensor->data + i * chan_size;
-        float* output_data = ( float* )output_tensor->data + i * chan_size;
+        float* input_data = (float*)input_tensor->data + i * chan_size;
+        float* output_data = (float*)output_tensor->data + i * chan_size;
 
         for (int j = 0; j < chan_size; j++)
         {
@@ -69,7 +68,7 @@ int ref_selu_fp32(struct tensor* output_tensor, struct tensor* input_tensor, str
 }
 
 int ref_selu_uint8(struct tensor* output_tensor, struct tensor* input_tensor, struct selu_param* selu_param,
-                  int num_thread)
+                   int num_thread)
 {
     /* dequant */
     uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
@@ -81,12 +80,12 @@ int ref_selu_uint8(struct tensor* output_tensor, struct tensor* input_tensor, st
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input_data = ( float* )sys_malloc(input_size * sizeof(float));
-    float* output_data = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input_data = (float*)sys_malloc(input_size * sizeof(float));
+    float* output_data = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     float alpha = selu_param->alpha;
@@ -100,8 +99,8 @@ int ref_selu_uint8(struct tensor* output_tensor, struct tensor* input_tensor, st
     for (int i = 0; i < chan_num; i++)
     {
         int offset = i * chan_size;
-        input_data = ( float* )input_tensor->data + i * chan_size;
-        output_data = ( float* )output_tensor->data + i * chan_size;
+        input_data = (float*)input_tensor->data + i * chan_size;
+        output_data = (float*)output_tensor->data + i * chan_size;
 
         for (int j = 0; j < chan_size; j++)
         {
@@ -151,14 +150,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct selu_param* selu_param = ( struct selu_param* )ir_node->op.param_mem;
+    struct selu_param* selu_param = (struct selu_param*)ir_node->op.param_mem;
 
     int num_thread = exec_graph->num_thread;
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_selu_fp32(output_tensor, input_tensor, selu_param, num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_selu_uint8(output_tensor, input_tensor, selu_param, num_thread);
 
     return ret;
diff --git a/source/device/cpu/op/shape/shape_ref.c b/source/device/cpu/op/shape/shape_ref.c
index c515c8505..ec27a9c41 100644
--- a/source/device/cpu/op/shape/shape_ref.c
+++ b/source/device/cpu/op/shape/shape_ref.c
@@ -32,7 +32,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -67,7 +66,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     const int* inDims = input_tensor->dims;
     int inDims_size = input_tensor->dim_num;
     int* outData = (int*)output_tensor->data;
-    for(int i = 0; i < inDims_size; i++){
+    for (int i = 0; i < inDims_size; i++)
+    {
         *outData = inDims[i];
         outData++;
     }
diff --git a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c
index f1838ccb7..545bf2fc0 100644
--- a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c
+++ b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 int ref_shuffle_channel_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct shuffle_channel_param* param)
 {
     int batch = input_tensor->dims[0];
@@ -141,7 +140,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
 
-    if (input_tensor->dim_num !=4)
+    if (input_tensor->dim_num != 4)
     {
         TLOG_ERR("dims num is not 4, not support shuffle channel\n");
         return -1;
@@ -156,14 +155,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct graph* ir_graph = ir_node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct shuffle_channel_param* param = ( struct shuffle_channel_param* )ir_node->op.param_mem;
+    struct shuffle_channel_param* param = (struct shuffle_channel_param*)ir_node->op.param_mem;
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_shuffle_channel_fp32(input_tensor, output_tensor, param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_shuffle_channel_uint8(input_tensor, output_tensor, param);
-    else if(input_tensor->data_type == TENGINE_DT_INT8)
+    else if (input_tensor->data_type == TENGINE_DT_INT8)
         ret = ref_shuffle_channel_int8(input_tensor, output_tensor, param);
     else
         TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c
index 3fb563818..1b7b3fbaf 100644
--- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c
+++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c
index 47713d9dc..af186d50c 100644
--- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c
+++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.c
@@ -28,7 +28,6 @@
 
 #include <arm_neon.h>
 
-
 #define SIGMOID_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define SIGMOID_MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -55,9 +54,9 @@ static float fast_exp1(float x)
     float t = x * 1.442695041f;
     float fi = floorf(t);
     float f = t - fi;
-    int i = ( int )fi;
+    int i = (int)fi;
     cvt.f = (0.3371894346f * f + 0.657636276f) * f + 1.00172476f; /* compute 2^f */
-    cvt.i += (i << 23); /* scale by 2^i */
+    cvt.i += (i << 23);                                           /* scale by 2^i */
     return cvt.f;
 }
 
@@ -71,12 +70,10 @@ static float acl_exp(float x)
 
     /* exp(x) = = 2^k * exp(x-k ln2); k = round（x/ln2）*/
     float t = x * 1.4426950408f;
-    float f = x - (( int )t) * 0.6931471805f;
-    int i = ( int )t;
+    float f = x - ((int)t) * 0.6931471805f;
+    int i = (int)t;
     /// cvt.f = (0.3371894346f * f + 0.657636276f) * f + 1.00172476f;       /* compute 2^f */
-    cvt.f =
-        1 + f * 1.00000011921f + (0.0416598916054f + f * 0.00833693705499f) * f * f +
-        ((0.500000596046f + f * 0.166665703058f) + (0.0014122662833f + f * 0.000195780929062f) * f * f) * f * f * f * f;
+    cvt.f = 1 + f * 1.00000011921f + (0.0416598916054f + f * 0.00833693705499f) * f * f + ((0.500000596046f + f * 0.166665703058f) + (0.0014122662833f + f * 0.000195780929062f) * f * f) * f * f * f * f;
     cvt.i += (i << 23); /* scale by 2^i */
     return cvt.f;
 }
@@ -125,8 +122,8 @@ static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, struct tab* coeffs)
 /* ACL exp function impelement */
 static inline float32x4_t vexpq_f32(float32x4_t x)
 {
-    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);    // ln(2)
-    const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f);    // 1/ln(2)
+    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);     // ln(2)
+    const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
     const float32x4_t CONST_0 = vdupq_n_f32(0.f);
     const int32x4_t CONST_NEGATIVE_126 = vdupq_n_s32(-126);
 
@@ -148,7 +145,7 @@ exp(x) = lim(1+x/n)^n       // n=10
 */
 static inline float32x4_t vexpq10_f32(float32x4_t x)
 {
-    x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f);    // n = 10
+    x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10
     x = vmulq_f32(x, x);
     x = vmulq_f32(x, x);
     x = vmulq_f32(x, x);
@@ -165,8 +162,8 @@ static inline float32x4_t vexpq10_f32(float32x4_t x)
 int sigmoid_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread)
 {
     init_tab();
-    float* input = ( float* )input_tensor->data;
-    float* output = ( float* )output_tensor->data;
+    float* input = (float*)input_tensor->data;
+    float* output = (float*)output_tensor->data;
 
     float32x4_t min = vdupq_n_f32(-30.0f);
     float32x4_t max = vdupq_n_f32(30.0f);
diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h
index c0fc8a80f..276ee54e8 100644
--- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h
+++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_kernel_arm.h
@@ -29,7 +29,6 @@
 
 #include <arm_neon.h>
 
-
 struct tab
 {
     float32x4_t a0;
diff --git a/source/device/cpu/op/sigmoid/sigmoid_ref.c b/source/device/cpu/op/sigmoid/sigmoid_ref.c
index a347c1684..fd2286f65 100644
--- a/source/device/cpu/op/sigmoid/sigmoid_ref.c
+++ b/source/device/cpu/op/sigmoid/sigmoid_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 #define SIGMOID_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define SIGMOID_MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -44,19 +43,19 @@ int ref_sigmoid_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
 
     if (dim_num == 4)
     {
-        int batch   = input_tensor->dims[0];
+        int batch = input_tensor->dims[0];
         int channel = input_tensor->dims[1];
-        int cstep   = input_tensor->dims[2] * input_tensor->dims[3];
-        int bstep   = channel * cstep;
+        int cstep = input_tensor->dims[2] * input_tensor->dims[3];
+        int bstep = channel * cstep;
 
-        for (int n=0; n<batch; n++)
+        for (int n = 0; n < batch; n++)
         {
 #pragma omp parallel for num_threads(num_thread)
-            for (int c=0; c<channel; c++)
+            for (int c = 0; c < channel; c++)
             {
-                float* input_data  = (float*)input_tensor->data + n * bstep + c * cstep;
+                float* input_data = (float*)input_tensor->data + n * bstep + c * cstep;
                 float* output_data = (float*)output_tensor->data + n * bstep + c * cstep;
-                for (int i=0; i<cstep; i++)
+                for (int i = 0; i < cstep; i++)
                 {
                     output_data[i] = SIGMOID_MIN(input_data[i], 30.0f);
                     output_data[i] = SIGMOID_MAX(input_data[i], -30.0f);
@@ -78,13 +77,12 @@ int ref_sigmoid_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
             output_data[i] = 1.f / (1 + expf(-output_data[i]));
         }
     }
-	
+
     return 0;
 }
 
 int ref_sigmoid_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
-
     /* dequant */
     uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
     uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
@@ -95,12 +93,12 @@ int ref_sigmoid_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float));
-	float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     for (int i = 0; i < input_size; i++)
@@ -123,7 +121,7 @@ int ref_sigmoid_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
     }
 
     sys_free(input_fp32);
-	sys_free(output_fp32);
+    sys_free(output_fp32);
 
     return 0;
 }
@@ -149,8 +147,7 @@ static int reshape_node(struct node_ops* node_ops, struct exec_node* exec_node,
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] ||
-        input_tensor->dims[3] != output_tensor->dims[3])
+    if (input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3])
         ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
 
     return ret;
@@ -168,12 +165,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_sigmoid_fp32(input_tensor, output_tensor, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_sigmoid_uint8(input_tensor, output_tensor, exec_graph->num_thread);
-    
+
     return ret;
 }
 
diff --git a/source/device/cpu/op/slice/slice_ref.c b/source/device/cpu/op/slice/slice_ref.c
index 54d295c51..04825d9aa 100644
--- a/source/device/cpu/op/slice/slice_ref.c
+++ b/source/device/cpu/op/slice/slice_ref.c
@@ -39,25 +39,24 @@
 #include <string.h>
 #include <stdbool.h>
 
-
 struct shape_dim
 {
-    int dims[4];    // for caffe
-    int begins[4];    // for tf
-    int sizes[4];    // for tf
+    int dims[4];   // for caffe
+    int begins[4]; // for tf
+    int sizes[4];  // for tf
 };
 
 struct slice_param_ref
 {
-    int in_shape[4];    // the dim of the input
+    int in_shape[4]; // the dim of the input
     int in_shape_3[3];
     int in_shape_2[2];
-    struct shape_dim* output_shape;    // out shape
+    struct shape_dim* output_shape; // out shape
     int out_num;
     int dim_num;
-    int axis;    // for caffe
-    int step;    // for onnx
-    float out_scale;    // for input tensor int8
+    int axis;        // for caffe
+    int step;        // for onnx
+    float out_scale; // for input tensor int8
     bool iscaffe;
     bool ismxnet;
     bool isonnx;
@@ -128,8 +127,7 @@ static int tf_run(const int8_t* in_data, int8_t** out_data, int element_size, co
             for (int j = start_dim_2; j < stop_dim_2; ++j)
             {
                 int len = stop_dim_3 - start_dim_3;
-                int input_off =
-                    n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_dim_3;
+                int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_dim_3;
                 memcpy(output, input + input_off * element_size, (size_t)len * element_size);
                 output += len * element_size;
             }
@@ -168,8 +166,7 @@ static int mxnet_run(const int8_t* in_data, int8_t** out_data, int element_size,
                 for (int j = start_2; j < stop_2; ++j)
                 {
                     int len = start_3 - stop_3;
-                    int input_off =
-                        n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_3;
+                    int input_off = n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + start_3;
                     memcpy(output, input + input_off * element_size, (size_t)len * element_size);
                     output += len * element_size;
                 }
@@ -267,8 +264,7 @@ static int onnx_run(const int8_t* in_data, int8_t** out_data, int element_size,
                     {
                         for (int k = start_3; k < stop_3; k = k + step_3)
                         {
-                            int input_index =
-                                n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + k;
+                            int input_index = n * in_dim_1 * in_dim_2 * in_dim_3 + i * in_dim_2 * in_dim_3 + j * in_dim_3 + k;
                             memcpy(output, input + input_index * element_size, element_size);
                             output += element_size;
                         }
@@ -373,16 +369,16 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct tensor* input_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
+    struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
     struct slice_param_ref op_param;
-    slice_param_t* _param = ( struct slice_param* )(ir_node->op.param_mem);
+    slice_param_t* _param = (struct slice_param*)(ir_node->op.param_mem);
 
     int out_num = exec_node->output_num;
 
     struct shape_dim sd[MAX_SHAPE_DIM_NUM * 2];
-    int8_t** out_data_ptrs = ( int8_t** )sys_malloc(out_num * sizeof(int8_t*));
-    if(out_data_ptrs == NULL)
+    int8_t** out_data_ptrs = (int8_t**)sys_malloc(out_num * sizeof(int8_t*));
+    if (out_data_ptrs == NULL)
     {
         return -1;
     }
@@ -390,12 +386,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     op_param.axis = _param->axis;
     op_param.output_shape = sd;
     op_param.out_num = out_num;
-    op_param.dim_num = ( int )(input_tensor->dim_num);
+    op_param.dim_num = (int)(input_tensor->dim_num);
     op_param.iscaffe = _param->iscaffe;
     op_param.ismxnet = _param->ismxnet;
     op_param.isonnx = _param->isonnx;
 
-    int8_t* input = ( int8_t* )input_tensor->data;
+    int8_t* input = (int8_t*)input_tensor->data;
     unsigned int mem_size = input_tensor->elem_size;
 
     if (op_param.iscaffe)
@@ -413,7 +409,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
             {
                 op_param.output_shape[i].dims[j] = out_tensor->dims[j];
             }
-            out_data_ptrs[i] = ( int8_t* )out_tensor->data;
+            out_data_ptrs[i] = (int8_t*)out_tensor->data;
         }
     }
     else if (op_param.ismxnet || op_param.isonnx)
@@ -439,17 +435,16 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
             }
         }
         struct tensor* out_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-        out_data_ptrs[0] = ( int8_t* )out_tensor->data;
+        out_data_ptrs[0] = (int8_t*)out_tensor->data;
 
-        if (input_tensor->dims[0] == output_tensor->dims[0] && input_tensor->dims[1] == output_tensor->dims[1] &&
-            input_tensor->dims[2] == output_tensor->dims[2] && input_tensor->dims[3] == output_tensor->dims[3])
+        if (input_tensor->dims[0] == output_tensor->dims[0] && input_tensor->dims[1] == output_tensor->dims[1] && input_tensor->dims[2] == output_tensor->dims[2] && input_tensor->dims[3] == output_tensor->dims[3])
         {
-            memcpy(( void* )(out_data_ptrs[0]), ( void* )input, mem_size*input_tensor->elem_num);
+            memcpy((void*)(out_data_ptrs[0]), (void*)input, mem_size * input_tensor->elem_num);
             sys_free(out_data_ptrs);
             return true;
         }
     }
-    else    // For tensorflow, there is only one output tensor
+    else // For tensorflow, there is only one output tensor
     {
         int maxdim = 4;
         int real_dim = op_param.dim_num;
@@ -464,14 +459,14 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
             }
             else
             {
-                op_param.output_shape[0].begins[idx] = *( int* )get_vector_data(_param->begin_, dim_idx);
-                op_param.output_shape[0].sizes[idx] = *( int* )get_vector_data(_param->size_, dim_idx);
+                op_param.output_shape[0].begins[idx] = *(int*)get_vector_data(_param->begin_, dim_idx);
+                op_param.output_shape[0].sizes[idx] = *(int*)get_vector_data(_param->size_, dim_idx);
                 op_param.in_shape[idx] = input_tensor->dims[dim_idx];
                 dim_idx++;
             }
         }
         struct tensor* out_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-        out_data_ptrs[0] = ( int8_t* )out_tensor->data;
+        out_data_ptrs[0] = (int8_t*)out_tensor->data;
     }
 
     int ret = -1;
@@ -487,19 +482,19 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         int32_t input_zero = input_tensor->zero_point;
         int32_t output_zero = output_tensor->zero_point;
 
-        float* input_fp32  = (float*)sys_malloc(input_tensor->elem_num * sizeof(float));
+        float* input_fp32 = (float*)sys_malloc(input_tensor->elem_num * sizeof(float));
         float* output_fp32 = (float*)sys_malloc(output_tensor->elem_num * sizeof(float));
-        out_data_ptrs[0] = ( int8_t* )output_fp32;
+        out_data_ptrs[0] = (int8_t*)output_fp32;
 
-        for(int i=0; i<input_tensor->elem_num; i++)
+        for (int i = 0; i < input_tensor->elem_num; i++)
         {
-            input_fp32[i] = ((float )input_uint8[i] - (float )input_zero) * input_scale;
+            input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
         }
 
-        ret = ref_slice_common((int8_t *)input_fp32, out_data_ptrs, sizeof(float), &op_param);
+        ret = ref_slice_common((int8_t*)input_fp32, out_data_ptrs, sizeof(float), &op_param);
 
         /* quant to uint8 */
-        for(int i=0; i<output_tensor->elem_num; i++)
+        for (int i = 0; i < output_tensor->elem_num; i++)
         {
             int udata = round(output_fp32[i] / output_scale + output_zero);
             if (udata > 255)
diff --git a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c
index ac41456a5..9ffe8e5c2 100644
--- a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c
+++ b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c
@@ -40,7 +40,6 @@
 
 #include <arm_neon.h>
 
-
 static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
@@ -52,16 +51,15 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] ||
-        input_tensor->dims[3] != output_tensor->dims[3])
-    ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
+    if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3])
+        ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
 
     return ret;
 }
 
 static inline float32x4_t vexpq10_f32(float32x4_t x)
 {
-    x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f);    // n = 10
+    x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10
     x = vmulq_f32(x, x);
     x = vmulq_f32(x, x);
     x = vmulq_f32(x, x);
@@ -77,8 +75,8 @@ static inline float32x4_t vexpq10_f32(float32x4_t x)
 
 static void GetMaxArray(float* input, float* array, int in_size, int on_size, int num_thread)
 {
-    float* input_ptr = ( float* )input;
-    float* array_ptr = ( float* )array;
+    float* input_ptr = (float*)input;
+    float* array_ptr = (float*)array;
     memset(array, 0, in_size * sizeof(float));
 
     // #pragma omp parallel for num_threads(num_thread)
@@ -115,10 +113,10 @@ static void GetMaxArray(float* input, float* array, int in_size, int on_size, in
 static void GetOutResult(float* input, float* output, float* maxarray, float* sum_array, int in_size, int on_size,
                          int num_thread)
 {
-    float* input_ptr = ( float* )input;
-    float* output_ptr = ( float* )output;
-    float* maxarray_ptr = ( float* )maxarray;
-    float* sum_array_ptr = ( float* )sum_array;
+    float* input_ptr = (float*)input;
+    float* output_ptr = (float*)output;
+    float* maxarray_ptr = (float*)maxarray;
+    float* sum_array_ptr = (float*)sum_array;
 
     memset(sum_array, 0x0, in_size * sizeof(float));
 
@@ -183,7 +181,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct softmax_param* softmax_param = ( struct softmax_param* )ir_node->op.param_mem;
+    struct softmax_param* softmax_param = (struct softmax_param*)ir_node->op.param_mem;
 
     int element_size = input_tensor->elem_size;
     int dims[4];
@@ -211,8 +209,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     uint8_t* input = input_tensor->data;
     uint8_t* output = output_tensor->data;
-    float* max_array = ( float* )malloc(in_size * sizeof(float));
-    float* sum_array = ( float* )malloc(in_size * sizeof(float));
+    float* max_array = (float*)malloc(in_size * sizeof(float));
+    float* sum_array = (float*)malloc(in_size * sizeof(float));
 
     int on_in_size = on_size * in_size;
 
@@ -221,8 +219,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     if (element_size == 1)
     {
-        input_f = ( float* )malloc(on_in_size * 4);
-        output_f = ( float* )malloc(on_in_size * 4);
+        input_f = (float*)malloc(on_in_size * 4);
+        output_f = (float*)malloc(on_in_size * 4);
 
         /* todo */
 
@@ -235,8 +233,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         /* get max */
         int img_base = i * on_in_size * element_size;
 
-        GetMaxArray(( float* )(input + img_base), max_array, in_size, on_size, exec_graph->num_thread);
-        GetOutResult(( float* )(input + img_base), ( float* )(output + img_base), max_array, sum_array, in_size,
+        GetMaxArray((float*)(input + img_base), max_array, in_size, on_size, exec_graph->num_thread);
+        GetOutResult((float*)(input + img_base), (float*)(output + img_base), max_array, sum_array, in_size,
                      on_size, exec_graph->num_thread);
     }
 
diff --git a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c
index 3c63e3a3c..93678c225 100644
--- a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c
+++ b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c
@@ -34,7 +34,6 @@
 
 #include "arm_math.h"
 
-
 /**
  * @brief Q7 softmax function
  * @param[in]       vec_in      pointer to input vector
@@ -55,9 +54,8 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] ||
-        input_tensor->dims[3] != output_tensor->dims[3])
-    ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
+    if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3])
+        ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
 
     return ret;
 }
diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref.h b/source/device/cpu/op/softmax/softmax_kernel_ref.h
index 2cf5cd4c9..5c58a44de 100644
--- a/source/device/cpu/op/softmax/softmax_kernel_ref.h
+++ b/source/device/cpu/op/softmax/softmax_kernel_ref.h
@@ -25,7 +25,6 @@
 #ifndef __SOFTMAX_KERNEL_REF_H__
 #define __SOFTMAX_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
@@ -33,11 +32,10 @@
 #include <math.h>
 #include <string.h>
 
-
 static void GetMaxArray(void* input, void* array, int in_size, int on_size)
 {
-    float* input_ptr = ( float* )input;
-    float* array_ptr = ( float* )array;
+    float* input_ptr = (float*)input;
+    float* array_ptr = (float*)array;
 
     memcpy(array_ptr, input_ptr, in_size * sizeof(float));
 
@@ -53,10 +51,10 @@ static void GetMaxArray(void* input, void* array, int in_size, int on_size)
 
 static void GetOutResult(void* input, void* output, void* array, void* sum_array, int in_size, int on_size)
 {
-    float* input_ptr = ( float* )input;
-    float* output_ptr = ( float* )output;
-    float* array_ptr = ( float* )array;
-    float* sum_array_ptr = ( float* )sum_array;
+    float* input_ptr = (float*)input;
+    float* output_ptr = (float*)output;
+    float* array_ptr = (float*)array;
+    float* sum_array_ptr = (float*)sum_array;
 
     memset(sum_array, 0x0, in_size * sizeof(float));
 
diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c b/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c
index ecf256746..351be8b21 100644
--- a/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c
+++ b/source/device/cpu/op/softmax/softmax_kernel_ref_fp32.c
@@ -36,13 +36,12 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 int ref_softmax_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int axis)
 {
     int element_size = input_tensor->elem_size;
     int type = input_tensor->data_type;
 
-    int* dims = ( int* )sys_malloc(input_tensor->dim_num * sizeof(int));
+    int* dims = (int*)sys_malloc(input_tensor->dim_num * sizeof(int));
     for (int i = 0; i < input_tensor->dim_num; i++)
     {
         dims[i] = input_tensor->dims[i];
@@ -63,8 +62,8 @@ int ref_softmax_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
     }
     on_size = dims[axis];
 
-    float* max_array = ( float* )sys_malloc(in_size * sizeof(float));
-    float* sum_array = ( float* )sys_malloc(in_size * sizeof(float));
+    float* max_array = (float*)sys_malloc(in_size * sizeof(float));
+    float* sum_array = (float*)sys_malloc(in_size * sizeof(float));
 
     int on_in_size = on_size * in_size;
 
diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c b/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c
index 015f5149e..7b13afd7f 100644
--- a/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c
+++ b/source/device/cpu/op/softmax/softmax_kernel_ref_int8.c
@@ -38,13 +38,12 @@
 
 #include <math.h>
 
-
 int ref_softmax_int8(struct tensor* input_tensor, struct tensor* output_tensor, int axis)
 {
     int element_size = input_tensor->elem_size;
     int type = input_tensor->data_type;
 
-    int* dims = ( int* )sys_malloc(input_tensor->dim_num * sizeof(int));
+    int* dims = (int*)sys_malloc(input_tensor->dim_num * sizeof(int));
     for (int i = 0; i < input_tensor->dim_num; i++)
     {
         dims[i] = input_tensor->dims[i];
@@ -65,16 +64,16 @@ int ref_softmax_int8(struct tensor* input_tensor, struct tensor* output_tensor,
     }
     on_size = dims[axis];
 
-    float* max_array = ( float* )sys_malloc(in_size * sizeof(float));
-    float* sum_array = ( float* )sys_malloc(in_size * sizeof(float));
+    float* max_array = (float*)sys_malloc(in_size * sizeof(float));
+    float* sum_array = (float*)sys_malloc(in_size * sizeof(float));
 
     int on_in_size = on_size * in_size;
     int totol_size = on_in_size * out_size;
 
     int8_t* input = (int8_t*)input_tensor->data;
     int8_t* output = (int8_t*)output_tensor->data;
-    float* input_f = ( float* )sys_malloc(totol_size * 4);
-    float* output_f = ( float* )sys_malloc(totol_size * 4);
+    float* input_f = (float*)sys_malloc(totol_size * 4);
+    float* output_f = (float*)sys_malloc(totol_size * 4);
 
     float input_scale = input_tensor->scale;
     float output_scale = output_tensor->scale;
diff --git a/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c b/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c
index 08a3cdb58..93565ad5c 100644
--- a/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c
+++ b/source/device/cpu/op/softmax/softmax_kernel_ref_uint8.c
@@ -38,13 +38,12 @@
 
 #include <math.h>
 
-
 int ref_softmax_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int axis)
 {
     int element_size = input_tensor->elem_size;
     int type = input_tensor->data_type;
 
-    int* dims = ( int* )sys_malloc(input_tensor->dim_num * sizeof(int));
+    int* dims = (int*)sys_malloc(input_tensor->dim_num * sizeof(int));
     for (int i = 0; i < input_tensor->dim_num; i++)
     {
         dims[i] = input_tensor->dims[i];
@@ -65,16 +64,16 @@ int ref_softmax_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
     }
     on_size = dims[axis];
 
-    float* max_array = ( float* )sys_malloc(in_size * sizeof(float));
-    float* sum_array = ( float* )sys_malloc(in_size * sizeof(float));
+    float* max_array = (float*)sys_malloc(in_size * sizeof(float));
+    float* sum_array = (float*)sys_malloc(in_size * sizeof(float));
 
     int on_in_size = on_size * in_size;
     int totol_size = on_in_size * out_size;
 
     uint8_t* input = (uint8_t*)input_tensor->data;
     uint8_t* output = (uint8_t*)output_tensor->data;
-    float* input_f = ( float* )sys_malloc(totol_size * 4);
-    float* output_f = ( float* )sys_malloc(totol_size * 4);
+    float* input_f = (float*)sys_malloc(totol_size * 4);
+    float* output_f = (float*)sys_malloc(totol_size * 4);
 
     float input_scale = input_tensor->scale;
     float output_scale = output_tensor->scale;
diff --git a/source/device/cpu/op/softmax/softmax_ref.c b/source/device/cpu/op/softmax/softmax_ref.c
index 1042877b6..cb1a3b49d 100644
--- a/source/device/cpu/op/softmax/softmax_ref.c
+++ b/source/device/cpu/op/softmax/softmax_ref.c
@@ -38,7 +38,6 @@
 
 #include "softmax_kernel_ref.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -56,12 +55,13 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct softmax_param* softmax_param = ( struct softmax_param* )ir_node->op.param_mem;
+    struct softmax_param* softmax_param = (struct softmax_param*)ir_node->op.param_mem;
 
     // Check: axis must be in the range: [-input_tensor->dim_num, input_tensor->dim_num)
     // Note: Here we always assume 0 <= input_tensor->dim_num
     int axis = softmax_param->axis;
-    if (axis < -input_tensor->dim_num || input_tensor->dim_num <= axis) {
+    if (axis < -input_tensor->dim_num || input_tensor->dim_num <= axis)
+    {
         TLOG_ERR("Input softmax axis %d not to be supported.\n", axis);
         return -1;
     }
@@ -99,8 +99,7 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] ||
-        input_tensor->dims[3] != output_tensor->dims[3])
+    if (input_tensor->dims[0] != output_tensor->dims[0] || input_tensor->dims[1] != output_tensor->dims[1] || input_tensor->dims[2] != output_tensor->dims[2] || input_tensor->dims[3] != output_tensor->dims[3])
         ret = set_ir_tensor_shape(output_tensor, input_tensor->dims, input_tensor->dim_num);
 
     return ret;
@@ -112,12 +111,12 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {.prerun = NULL,
-        .run = run,
-        .reshape = reshape,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score};
+                                       .run = run,
+                                       .reshape = reshape,
+                                       .postrun = NULL,
+                                       .init_node = init_node,
+                                       .release_node = release_node,
+                                       .score = score};
 
 int register_softmax_ref_op()
 {
diff --git a/source/device/cpu/op/softplus/softplus_ref.c b/source/device/cpu/op/softplus/softplus_ref.c
index efb41cfc3..6931ab047 100644
--- a/source/device/cpu/op/softplus/softplus_ref.c
+++ b/source/device/cpu/op/softplus/softplus_ref.c
@@ -86,8 +86,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-	int ret = -1;
-    if(input_tensor->data_type == TENGINE_DT_FP32)
+    int ret = -1;
+    if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_softplus_fp32(input_tensor, output_tensor, exec_graph->num_thread);
     else
         printf("Input data type %d not to be supported.\n", input_tensor->data_type);
@@ -112,14 +112,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {
-        .prerun = NULL,
-        .run = run,
-        .reshape = reshape,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score
-};
+    .prerun = NULL,
+    .run = run,
+    .reshape = reshape,
+    .postrun = NULL,
+    .init_node = init_node,
+    .release_node = release_node,
+    .score = score};
 
 int register_softplus_ref_op()
 {
diff --git a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c
index 0c7843831..6a0aa26a4 100644
--- a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c
+++ b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int ref_spacetobatchnd_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
                                    struct spacetobatchnd_param* param, int num_thread)
 {
@@ -92,22 +91,16 @@ static int ref_spacetobatchnd_fp32(struct tensor* input_tensor, struct tensor* o
             {
                 for (int out_w = 0; out_w < output_width; ++out_w)
                 {
-                    float* out =
-                        out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w;
+                    float* out = out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w;
 
-                    if (out_h * block_shape_height + shift_h < padding_top ||
-                        out_h * block_shape_height + shift_h >= padding_top + input_height ||
-                        out_w * block_shape_width + shift_w < padding_left ||
-                        out_w * block_shape_width + shift_w >= padding_left + input_width)
+                    if (out_h * block_shape_height + shift_h < padding_top || out_h * block_shape_height + shift_h >= padding_top + input_height || out_w * block_shape_width + shift_w < padding_left || out_w * block_shape_width + shift_w >= padding_left + input_width)
                     {
                         // This may not execute correctly when pad_value != 0 and T != uint8.
                         *out = 0;
                     }
                     else
                     {
-                        const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth +
-                                          ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height +
-                                          ((out_w * block_shape_width + shift_w) - padding_left);
+                        const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth + ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height + ((out_w * block_shape_width + shift_w) - padding_left);
                         *out = *in;
                     }
                 }
@@ -119,7 +112,7 @@ static int ref_spacetobatchnd_fp32(struct tensor* input_tensor, struct tensor* o
 }
 
 static int ref_spacetobatchnd_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
-                                   struct spacetobatchnd_param* param, int num_thread)
+                                    struct spacetobatchnd_param* param, int num_thread)
 {
     /* dequant */
     uint8_t* input_uint8 = (uint8_t*)input_tensor->data;
@@ -131,12 +124,12 @@ static int ref_spacetobatchnd_uint8(struct tensor* input_tensor, struct tensor*
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* in_data = ( float* )sys_malloc(input_size * sizeof(float));
-    float* out_data = ( float* )sys_malloc(output_size * sizeof(float));
+    float* in_data = (float*)sys_malloc(input_size * sizeof(float));
+    float* out_data = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        in_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        in_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     int out_dims[4];
@@ -188,22 +181,16 @@ static int ref_spacetobatchnd_uint8(struct tensor* input_tensor, struct tensor*
             {
                 for (int out_w = 0; out_w < output_width; ++out_w)
                 {
-                    float* out =
-                        out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w;
+                    float* out = out_data + out_b * out_stride_batch + c * out_stride_depth + out_h * out_stride_height + out_w;
 
-                    if (out_h * block_shape_height + shift_h < padding_top ||
-                        out_h * block_shape_height + shift_h >= padding_top + input_height ||
-                        out_w * block_shape_width + shift_w < padding_left ||
-                        out_w * block_shape_width + shift_w >= padding_left + input_width)
+                    if (out_h * block_shape_height + shift_h < padding_top || out_h * block_shape_height + shift_h >= padding_top + input_height || out_w * block_shape_width + shift_w < padding_left || out_w * block_shape_width + shift_w >= padding_left + input_width)
                     {
                         // This may not execute correctly when pad_value != 0 and T != uint8.
                         *out = 0;
                     }
                     else
                     {
-                        const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth +
-                                          ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height +
-                                          ((out_w * block_shape_width + shift_w) - padding_left);
+                        const float* in = in_data + input_batch * in_stride_batch + c * in_stride_depth + ((out_h * block_shape_height + shift_h) - padding_top) * in_stride_height + ((out_w * block_shape_width + shift_w) - padding_left);
                         *out = *in;
                     }
                 }
@@ -247,12 +234,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct spacetobatchnd_param* spacetobatchnd_param = ( struct spacetobatchnd_param* )ir_node->op.param_mem;
+    struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)ir_node->op.param_mem;
 
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ref_spacetobatchnd_fp32(input_tensor, output_tensor, spacetobatchnd_param, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
-         ref_spacetobatchnd_uint8(input_tensor, output_tensor, spacetobatchnd_param, exec_graph->num_thread);
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
+        ref_spacetobatchnd_uint8(input_tensor, output_tensor, spacetobatchnd_param, exec_graph->num_thread);
 
     return 0;
 }
diff --git a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c
index bcec8c2d7..aa8217929 100644
--- a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c
+++ b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c
@@ -32,10 +32,8 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 #include <math.h>
 
-
 int ref_spacetodepth_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     float* input_data = (float*)input_tensor->data;
@@ -93,7 +91,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_spacetodepth_fp32(input_tensor, output_tensor, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_spacetodepth_uint8(input_tensor, output_tensor, exec_graph->num_thread);
 
     return ret;
diff --git a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c
index ea05f1a48..6179ad14c 100644
--- a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c
+++ b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 int ref_sparsetodense_fp32(struct tensor* input_tensor, struct tensor* output_shape_tensor,
                            struct tensor* sparse_values_tensor, struct tensor* output_tensor,
                            struct sparsetodense_param* param, int num_thread)
@@ -166,7 +165,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     sparse_values_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct sparsetodense_param* sparsetodense_param = ( struct sparsetodense_param* )ir_node->op.param_mem;
+    struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)ir_node->op.param_mem;
 
     int ret = ref_sparsetodense_fp32(input_tensor, output_shape_tensor, sparse_values_tensor, output_tensor,
                                      sparsetodense_param, exec_graph->num_thread);
diff --git a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c
index cbe6cedf9..dfd4e730c 100644
--- a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c
+++ b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c
@@ -36,63 +36,70 @@
 
 #include <math.h>
 
-int between(float value, float lowerBound, float upperBound){
-    if(value >= lowerBound && value <= upperBound){
+int between(float value, float lowerBound, float upperBound)
+{
+    if (value >= lowerBound && value <= upperBound)
+    {
         return 1;
-    } else {
+    }
+    else
+    {
         return 0;
     }
 }
 
-int BilinearSampling(int o_n, int o_c, int o_h, int o_w, int i_c, int i_h, int i_w, float* in_data, float* out_data, float* grid_total){
-
+int BilinearSampling(int o_n, int o_c, int o_h, int o_w, int i_c, int i_h, int i_w, float* in_data, float* out_data, float* grid_total)
+{
     float* tmp_out = out_data;
-    for(int n = 0; n < o_n; n++){
-        for(int c = 0; c < o_c; c++){
-            for(int h = 0; h < o_h; h++){
-                for(int w = 0; w < o_w; w++){
-                    int out_index = n*o_c*o_h*o_w + c*o_h*o_w + h*o_w + w;
-                    int grid_index = n*o_h*o_w*2 + h*o_w + w;
-                    float y_real = (*(grid_total + grid_index + o_h*o_w) + 1.0) * (i_h-1.0)/2.0;
-                    float x_real = (*(grid_total + grid_index)+1.0)*(i_w - 1.0)/2.0;
+    for (int n = 0; n < o_n; n++)
+    {
+        for (int c = 0; c < o_c; c++)
+        {
+            for (int h = 0; h < o_h; h++)
+            {
+                for (int w = 0; w < o_w; w++)
+                {
+                    int out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w;
+                    int grid_index = n * o_h * o_w * 2 + h * o_w + w;
+                    float y_real = (*(grid_total + grid_index + o_h * o_w) + 1.0) * (i_h - 1.0) / 2.0;
+                    float x_real = (*(grid_total + grid_index) + 1.0) * (i_w - 1.0) / 2.0;
                     int top_left_y = floor(y_real);
                     int top_left_x = floor(x_real);
                     float top_left_y_w = 1.0 - (y_real - top_left_y);
                     float top_left_x_w = 1.0 - (x_real - top_left_x);
-                    int data_index = n*i_c*i_h*i_w + c*i_h*i_w + top_left_y * i_w + top_left_x;
+                    int data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x;
                     float top_left_v = 0;
                     float top_right_v = 0;
                     float bottom_left_v = 0;
                     float bottom_right_v = 0;
                     int lower_bound = 0;
-                    if (between(top_left_x, lower_bound, i_w-1) &&  between(top_left_y, lower_bound, i_h-1)){
-                            top_left_v = *(in_data + data_index);
-                        }
-                    if (between(top_left_x + 1, lower_bound, i_w-1) && between(top_left_y, lower_bound, i_h-1)){
-                            top_right_v = *(in_data + data_index + 1);
-                        }
-                    if (between(top_left_x, lower_bound, i_w-1) && between(top_left_y + 1, lower_bound, i_h-1)){
-                            bottom_left_v = *(in_data + data_index + i_w);
-                        }
-                    if (between(top_left_x+1, lower_bound, i_w-1) && between(top_left_y + 1, lower_bound, i_h-1)){
-                            bottom_right_v = *(in_data + data_index + i_w + 1);
-                        }
-                    *(tmp_out+out_index) = top_left_v * top_left_y_w * top_left_x_w +
-                                            top_right_v * top_left_y_w * (1.0 - top_left_x_w) +
-                                            bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w +
-                                            bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w);
+                    if (between(top_left_x, lower_bound, i_w - 1) && between(top_left_y, lower_bound, i_h - 1))
+                    {
+                        top_left_v = *(in_data + data_index);
+                    }
+                    if (between(top_left_x + 1, lower_bound, i_w - 1) && between(top_left_y, lower_bound, i_h - 1))
+                    {
+                        top_right_v = *(in_data + data_index + 1);
+                    }
+                    if (between(top_left_x, lower_bound, i_w - 1) && between(top_left_y + 1, lower_bound, i_h - 1))
+                    {
+                        bottom_left_v = *(in_data + data_index + i_w);
                     }
-                    
+                    if (between(top_left_x + 1, lower_bound, i_w - 1) && between(top_left_y + 1, lower_bound, i_h - 1))
+                    {
+                        bottom_right_v = *(in_data + data_index + i_w + 1);
+                    }
+                    *(tmp_out + out_index) = top_left_v * top_left_y_w * top_left_x_w + top_right_v * top_left_y_w * (1.0 - top_left_x_w) + bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w);
+                }
             }
         }
     }
     return 1;
 }
 
-int ref_spatialtransformer_fp32(struct tensor* input_tensor,struct tensor* input_tensor1, struct tensor* output_tensor,
-                           struct spatialtransformer_param* param, int num_thread)
+int ref_spatialtransformer_fp32(struct tensor* input_tensor, struct tensor* input_tensor1, struct tensor* output_tensor,
+                                struct spatialtransformer_param* param, int num_thread)
 {
-    
     int indices_dim_size = input_tensor->dim_num;
 
     float* in_data = (float*)input_tensor->data;
@@ -101,40 +108,48 @@ int ref_spatialtransformer_fp32(struct tensor* input_tensor,struct tensor* input
 
     int batch = input_tensor->dims[1];
 
-    float* workspace = (float*)malloc(sizeof(float)*3*param->target_shape[0]*param->target_shape[1]);
+    float* workspace = (float*)malloc(sizeof(float) * 3 * param->target_shape[0] * param->target_shape[1]);
 
-    int target_shape_hw = param->target_shape[0]*param->target_shape[1];
-    for(int i = 1; i <= target_shape_hw; i++){
-        workspace[0*target_shape_hw + i-1] = -1.0 + (i-1) % param->target_shape[1] * 2.0 / (param->target_shape[1] - 1);
-        workspace[1*target_shape_hw + i-1] = -1.0 + (i-1) / param->target_shape[1] * 2.0 / (param->target_shape[0] - 1);
-        workspace[2*target_shape_hw + i-1] = 1.0;
+    int target_shape_hw = param->target_shape[0] * param->target_shape[1];
+    for (int i = 1; i <= target_shape_hw; i++)
+    {
+        workspace[0 * target_shape_hw + i - 1] = -1.0 + (i - 1) % param->target_shape[1] * 2.0 / (param->target_shape[1] - 1);
+        workspace[1 * target_shape_hw + i - 1] = -1.0 + (i - 1) / param->target_shape[1] * 2.0 / (param->target_shape[0] - 1);
+        workspace[2 * target_shape_hw + i - 1] = 1.0;
     }
     int m = 2;
     int p = target_shape_hw;
     int n = 3;
 
-    float* grid_src = (float*)malloc(sizeof(float)*2*target_shape_hw*batch);
-    float* grid_dst = (float*)malloc(sizeof(float)*3*target_shape_hw);
-    
-    for(int i = 0; i < 3*target_shape_hw; i++){
+    float* grid_src = (float*)malloc(sizeof(float) * 2 * target_shape_hw * batch);
+    float* grid_dst = (float*)malloc(sizeof(float) * 3 * target_shape_hw);
+
+    for (int i = 0; i < 3 * target_shape_hw; i++)
+    {
         grid_dst[i] = workspace[i];
     }
-    if(param->transformer_type == 0){   // Affine
-        for(int b = 0; b < batch; b++){
+    if (param->transformer_type == 0)
+    { // Affine
+        for (int b = 0; b < batch; b++)
+        {
             int index = b * target_shape_hw;
             float* grid_src_batch = grid_src + 0;
-            for(int i = 0; i < m; i++){
-                for(int j = 0; j < target_shape_hw; j++){
-                    grid_src_batch[i*p + j] = 0;
-                    for(int a = 1; a <= n; a++){
-                        grid_src_batch[i*p + j] += loc_data[i*n + a - 1] * grid_dst[(a-1)*p + j]; 
+            for (int i = 0; i < m; i++)
+            {
+                for (int j = 0; j < target_shape_hw; j++)
+                {
+                    grid_src_batch[i * p + j] = 0;
+                    for (int a = 1; a <= n; a++)
+                    {
+                        grid_src_batch[i * p + j] += loc_data[i * n + a - 1] * grid_dst[(a - 1) * p + j];
                     }
                 }
             }
         }
     }
-        
-    if (param->sampler_type == 1) {  // Bilinear 
+
+    if (param->sampler_type == 1)
+    { // Bilinear
         int o_n = output_tensor->dims[0];
         int o_c = output_tensor->dims[1];
         int o_h = output_tensor->dims[2];
@@ -142,14 +157,16 @@ int ref_spatialtransformer_fp32(struct tensor* input_tensor,struct tensor* input
         int i_c = input_tensor->dims[1];
         int i_h = input_tensor->dims[2];
         int i_w = input_tensor->dims[3];
-        int ret=BilinearSampling(o_n, o_c, o_h, o_w, i_c, i_h, i_w, in_data, out_data, grid_src);
-    } else {
+        int ret = BilinearSampling(o_n, o_c, o_h, o_w, i_c, i_h, i_w, in_data, out_data, grid_src);
+    }
+    else
+    {
         TLOG_ERR("Extra type not support yet\n");
     }
 
     free(grid_src);
     free(grid_dst);
-    free(workspace);   
+    free(workspace);
     return 0;
 }
 
@@ -183,10 +200,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor1 = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     int indices_dim_size = input_tensor1->dim_num;
 
-    struct spatialtransformer_param* spatialtransformer_param = ( struct spatialtransformer_param* )ir_node->op.param_mem;
+    struct spatialtransformer_param* spatialtransformer_param = (struct spatialtransformer_param*)ir_node->op.param_mem;
 
-    int ret = ref_spatialtransformer_fp32(input_tensor,input_tensor1, output_tensor,
-                                     spatialtransformer_param, exec_graph->num_thread);
+    int ret = ref_spatialtransformer_fp32(input_tensor, input_tensor1, output_tensor,
+                                          spatialtransformer_param, exec_graph->num_thread);
     if (ret != 0)
         return -1;
 
diff --git a/source/device/cpu/op/split/split_ref.c b/source/device/cpu/op/split/split_ref.c
index a79f2613f..2a7fa2890 100644
--- a/source/device/cpu/op/split/split_ref.c
+++ b/source/device/cpu/op/split/split_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 int ref_split_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct split_param* split_param, int* slice_index, int num_slices, int slice_size, int in_slice, int slice_axis)
 {
     float* input_data = (float*)input_tensor->data;
@@ -62,8 +61,8 @@ int ref_split_fp32(struct tensor* input_tensor, struct tensor* output_tensor, st
 
         *slice_index += out_slice;
     }
-	
-	return 0;
+
+    return 0;
 }
 
 int ref_split_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct split_param* split_param, int* slice_index, int num_slices, int slice_size, int in_slice, int slice_axis)
@@ -90,8 +89,8 @@ int ref_split_uint8(struct tensor* input_tensor, struct tensor* output_tensor, s
 
         *slice_index += out_slice;
     }
-	
-	return 0;
+
+    return 0;
 }
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
@@ -118,7 +117,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     // output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct split_param* split_param = ( struct split_param* )ir_node->op.param_mem;
+    struct split_param* split_param = (struct split_param*)ir_node->op.param_mem;
 
     /* the follow codes need to be checked ! */
     int slice_axis = split_param->axis;
@@ -135,15 +134,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int slice_index = 0;
     int out_num = ir_node->output_num;
 
-	int ret = -1;
+    int ret = -1;
     for (int i = 0; i < out_num; i++)
     {
         struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]);
-        
+
         if (input_tensor->data_type == TENGINE_DT_FP32)
             ret = ref_split_fp32(input_tensor, output_tensor, split_param, &slice_index, num_slices, slice_size, in_slice, slice_axis);
-       	else if(input_tensor->data_type == TENGINE_DT_UINT8)
-           	ret = ref_split_uint8(input_tensor, output_tensor, split_param, &slice_index, num_slices, slice_size, in_slice, slice_axis);
+        else if (input_tensor->data_type == TENGINE_DT_UINT8)
+            ret = ref_split_uint8(input_tensor, output_tensor, split_param, &slice_index, num_slices, slice_size, in_slice, slice_axis);
     }
 
     return ret;
diff --git a/source/device/cpu/op/squareddifference/squareddifference_ref.c b/source/device/cpu/op/squareddifference/squareddifference_ref.c
index 6fc416891..66a600291 100644
--- a/source/device/cpu/op/squareddifference/squareddifference_ref.c
+++ b/source/device/cpu/op/squareddifference/squareddifference_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 int ref_squareddifference_fp32(struct tensor* input_tensor_0, struct tensor* input_tensor_1,
                                struct tensor* output_tensor, int num_thread)
 {
@@ -86,7 +85,7 @@ int ref_squareddifference_fp32(struct tensor* input_tensor_0, struct tensor* inp
 }
 
 int ref_squareddifference_uint8(struct tensor* input_tensor_0, struct tensor* input_tensor_1,
-                               struct tensor* output_tensor, int num_thread)
+                                struct tensor* output_tensor, int num_thread)
 {
     /* dequant */
     uint8_t* input0_uint8 = (uint8_t*)input_tensor_0->data;
@@ -102,17 +101,17 @@ int ref_squareddifference_uint8(struct tensor* input_tensor_0, struct tensor* in
     int input1_size = input_tensor_1->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input0 = ( float* )sys_malloc(input0_size * sizeof(float));
-    float* input1 = ( float* )sys_malloc(input1_size * sizeof(float));
-    float* output = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input0 = (float*)sys_malloc(input0_size * sizeof(float));
+    float* input1 = (float*)sys_malloc(input1_size * sizeof(float));
+    float* output = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input0_size; i++)
     {
-        input0[i] = (( float )input0_uint8[i] - ( float )input0_zero) * input0_scale;
+        input0[i] = ((float)input0_uint8[i] - (float)input0_zero) * input0_scale;
     }
     for (int i = 0; i < input1_size; i++)
     {
-        input1[i] = (( float )input1_uint8[i] - ( float )input1_zero) * input1_scale;
+        input1[i] = ((float)input1_uint8[i] - (float)input1_zero) * input1_scale;
     }
 
     // dims size = 2 or 3
@@ -201,7 +200,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor_0->data_type == TENGINE_DT_FP32)
         ret = ref_squareddifference_fp32(input_tensor_0, input_tensor_1, output_tensor, exec_graph->num_thread);
-    else if(input_tensor_0->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor_0->data_type == TENGINE_DT_UINT8)
         ret = ref_squareddifference_uint8(input_tensor_0, input_tensor_1, output_tensor, exec_graph->num_thread);
 
     return ret;
diff --git a/source/device/cpu/op/squeeze/squeeze_ref.c b/source/device/cpu/op/squeeze/squeeze_ref.c
index 7550bdb25..1928d299e 100644
--- a/source/device/cpu/op/squeeze/squeeze_ref.c
+++ b/source/device/cpu/op/squeeze/squeeze_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 int ref_squeeze_fp32(struct tensor* input_tensor, struct tensor* output_tensor)
 {
     float* input_data = (float*)input_tensor->data;
@@ -80,10 +79,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_squeeze_fp32(input_tensor, output_tensor);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_squeeze_uint8(input_tensor, output_tensor);
 
     return ret;
diff --git a/source/device/cpu/op/strided_slice/strided_slice_ref.c b/source/device/cpu/op/strided_slice/strided_slice_ref.c
index 1f0297187..bb3cb9111 100644
--- a/source/device/cpu/op/strided_slice/strided_slice_ref.c
+++ b/source/device/cpu/op/strided_slice/strided_slice_ref.c
@@ -37,7 +37,6 @@
 
 #include <math.h>
 
-
 int ref_strided_slice_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct strided_slice_param* param)
 {
     int batch_num = input_tensor->dims[0];
@@ -65,10 +64,7 @@ int ref_strided_slice_fp32(struct tensor* input_tensor, struct tensor* output_te
             {
                 for (int w = 0; w < out_w; w++)
                 {
-                    int input_index = (param->begin[0] + n * param->stride[0]) * in_chw +
-                                      (param->begin[1] + c * param->stride[1]) * in_hw +
-                                      (param->begin[2] + h * param->stride[2]) * in_w +
-                                      (param->begin[3] + w * param->stride[3]);
+                    int input_index = (param->begin[0] + n * param->stride[0]) * in_chw + (param->begin[1] + c * param->stride[1]) * in_hw + (param->begin[2] + h * param->stride[2]) * in_w + (param->begin[3] + w * param->stride[3]);
                     int output_index = n * out_chw + c * out_hw + h * out_w + w;
 
                     output_data[output_index] = input_data[input_index];
@@ -107,10 +103,7 @@ int ref_strided_slice_uint8(struct tensor* input_tensor, struct tensor* output_t
             {
                 for (int w = 0; w < out_w; w++)
                 {
-                    int input_index = (param->begin[0] + n * param->stride[0]) * in_chw +
-                                      (param->begin[1] + c * param->stride[1]) * in_hw +
-                                      (param->begin[2] + h * param->stride[2]) * in_w +
-                                      (param->begin[3] + w * param->stride[3]);
+                    int input_index = (param->begin[0] + n * param->stride[0]) * in_chw + (param->begin[1] + c * param->stride[1]) * in_hw + (param->begin[2] + h * param->stride[2]) * in_w + (param->begin[3] + w * param->stride[3]);
                     int output_index = n * out_chw + c * out_hw + h * out_w + w;
 
                     output_data[output_index] = input_data[input_index];
@@ -144,12 +137,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct strided_slice_param* param = ( struct strided_slice_param* )ir_node->op.param_mem;
+    struct strided_slice_param* param = (struct strided_slice_param*)ir_node->op.param_mem;
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_strided_slice_fp32(input_tensor, output_tensor, param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_strided_slice_uint8(input_tensor, output_tensor, param);
 
     return ret;
diff --git a/source/device/cpu/op/swap_axis/swap_axis_ref.c b/source/device/cpu/op/swap_axis/swap_axis_ref.c
index 81b21e328..6aeef17bb 100644
--- a/source/device/cpu/op/swap_axis/swap_axis_ref.c
+++ b/source/device/cpu/op/swap_axis/swap_axis_ref.c
@@ -37,21 +37,18 @@
 #include <math.h>
 #include <string.h>
 
-
 static int ref_swap_axis_common(struct tensor* input_tensor, struct tensor* output_tensor, const int* dims, int element_size)
 {
-    const float* in_data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    const float* in_data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
 
     for (int i = 0; i < dims[0]; i++)
         for (int j = 0; j < dims[3]; j++)
             for (int p = 0; p < dims[2]; p++)
                 for (int q = 0; q < dims[1]; q++)
                 {
-                    int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] +
-                                    p * dims[1] * dims[4] + q * dims[4];
-                    int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] +
-                                   p * dims[3] * dims[4] + j * dims[4];
+                    int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] + p * dims[1] * dims[4] + q * dims[4];
+                    int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] + p * dims[3] * dims[4] + j * dims[4];
                     memcpy(out_data + out_index * element_size, in_data + in_index * element_size,
                            (size_t)dims[4] * element_size);
                 }
@@ -60,18 +57,16 @@ static int ref_swap_axis_common(struct tensor* input_tensor, struct tensor* outp
 
 static int ref_swap_axis_uint8(struct tensor* input_tensor, struct tensor* output_tensor, const int* dims, int element_size)
 {
-    const uint8_t* in_data = ( uint8_t* )input_tensor->data;
-    uint8_t* out_data = ( uint8_t* )output_tensor->data;
+    const uint8_t* in_data = (uint8_t*)input_tensor->data;
+    uint8_t* out_data = (uint8_t*)output_tensor->data;
 
     for (int i = 0; i < dims[0]; i++)
         for (int j = 0; j < dims[3]; j++)
             for (int p = 0; p < dims[2]; p++)
                 for (int q = 0; q < dims[1]; q++)
                 {
-                    int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] +
-                                    p * dims[1] * dims[4] + q * dims[4];
-                    int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] +
-                                   p * dims[3] * dims[4] + j * dims[4];
+                    int out_index = i * dims[1] * dims[2] * dims[3] * dims[4] + j * dims[2] * dims[1] * dims[4] + p * dims[1] * dims[4] + q * dims[4];
+                    int in_index = i * dims[1] * dims[2] * dims[3] * dims[4] + q * dims[2] * dims[3] * dims[4] + p * dims[3] * dims[4] + j * dims[4];
                     memcpy(out_data + out_index * element_size, in_data + in_index * element_size,
                            (size_t)dims[4] * element_size);
                 }
@@ -95,7 +90,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct swap_axis_param* _param = ( struct swap_axis_param* )(ir_node->op.param_mem);
+    struct swap_axis_param* _param = (struct swap_axis_param*)(ir_node->op.param_mem);
     int in_size = 1;
     for (int i = 0; i < input_tensor->dim_num; i++)
     {
@@ -127,10 +122,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     for (int i = dim1 + 1; i < in_size; i++)
         dims[4] *= input_tensor->dims[i];
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_swap_axis_common(input_tensor, output_tensor, dims, sizeof(float));
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_swap_axis_uint8(input_tensor, output_tensor, dims, sizeof(uint8_t));
 
     return ret;
diff --git a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c
index ac9bf9b41..de5975df5 100644
--- a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c
+++ b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c
@@ -34,7 +34,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     exec_node->inplace_map[0] = 0;
@@ -64,8 +63,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    float* idata = ( float* )input_tensor->data;
-    float* odata = ( float* )output_tensor->data;
+    float* idata = (float*)input_tensor->data;
+    float* odata = (float*)output_tensor->data;
     if (idata != odata)
     {
         TLOG_ERR("input and output are not the same mem\n");
diff --git a/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c b/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c
index 10de24f67..813075fc3 100644
--- a/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c
+++ b/source/device/cpu/op/tanh/cortex-a/tanh_kernel_arm.c
@@ -28,7 +28,6 @@
 
 #include <arm_neon.h>
 
-
 #define T_MAX(a, b) ((a) > (b) ? (a) : (b))
 #define T_MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -63,7 +62,7 @@ exp(x) = lim(1+x/n)^n       // n=10
 */
 static inline float32x4_t vexpq10_f32(float32x4_t x)
 {
-    x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f);    // n = 10
+    x = vmlaq_n_f32(vdupq_n_f32(1.0f), x, 0.0009765625f); // n = 10
     x = vmulq_f32(x, x);
     x = vmulq_f32(x, x);
     x = vmulq_f32(x, x);
@@ -79,7 +78,7 @@ static inline float32x4_t vexpq10_f32(float32x4_t x)
 
 static void tanh_kernel(int i, int id, void* data, const float* input, float* output)
 {
-    int step = (( int* )data)[0];
+    int step = ((int*)data)[0];
     float32x4_t min = vdupq_n_f32(-30.0f);
     float32x4_t max = vdupq_n_f32(30.0f);
     const float* cur_input = input + id * step;
@@ -113,8 +112,8 @@ static void tanh_kernel(int i, int id, void* data, const float* input, float* ou
 
 int tanh_run(struct tensor* output_tensor, struct tensor* input_tensor, int num_thread)
 {
-    float* data = ( float* )input_tensor->data;
-    float* out_data = ( float* )output_tensor->data;
+    float* data = (float*)input_tensor->data;
+    float* out_data = (float*)output_tensor->data;
 
     int chan_num = (input_tensor->dims[0]) * (input_tensor->dims[1]);
     int chan_size = (input_tensor->dims[2]) * (input_tensor->dims[3]);
diff --git a/source/device/cpu/op/tanh/tanh_ref.c b/source/device/cpu/op/tanh/tanh_ref.c
index a9236fb66..390f64332 100644
--- a/source/device/cpu/op/tanh/tanh_ref.c
+++ b/source/device/cpu/op/tanh/tanh_ref.c
@@ -35,7 +35,6 @@
 
 #include <math.h>
 
-
 int ref_tanh_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     float* input_data = (float*)input_tensor->data;
@@ -61,12 +60,12 @@ int ref_tanh_uint8(struct tensor* input_tensor, struct tensor* output_tensor, in
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float));
-    float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     for (int i = 0; i < input_size; i++)
@@ -108,10 +107,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_tanh_fp32(input_tensor, output_tensor, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_tanh_uint8(input_tensor, output_tensor, exec_graph->num_thread);
 
     return ret;
diff --git a/source/device/cpu/op/threshold/threshold_ref.c b/source/device/cpu/op/threshold/threshold_ref.c
index 60013623e..4672086a5 100644
--- a/source/device/cpu/op/threshold/threshold_ref.c
+++ b/source/device/cpu/op/threshold/threshold_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -64,12 +63,12 @@ int ref_threshold_uint8(struct tensor* input_tensor, struct tensor* output_tenso
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float));
-    float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     for (int i = 0; i < size; i++)
@@ -98,7 +97,7 @@ int ref_threshold_fp32(struct tensor* input_tensor, struct tensor* output_tensor
 {
     float* input_data = (float*)input_tensor->data;
     float* out_data = (float*)output_tensor->data;
-    
+
     for (int i = 0; i < size; i++)
     {
         out_data[i] = input_data[i] > threshold ? 1.f : 0.f;
@@ -115,12 +114,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct threshold_param* param = ( struct threshold_param* )node->op.param_mem;
+    struct threshold_param* param = (struct threshold_param*)node->op.param_mem;
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_threshold_fp32(input_tensor, output_tensor, param->threshold, output_tensor->elem_num, 1.0f, 0.0f);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_threshold_uint8(input_tensor, output_tensor, param->threshold, output_tensor->elem_num, 1.0f, 0.0f);
 
     return ret;
diff --git a/source/device/cpu/op/tile/tile_ref.c b/source/device/cpu/op/tile/tile_ref.c
index 0397e8772..0f51a5310 100644
--- a/source/device/cpu/op/tile/tile_ref.c
+++ b/source/device/cpu/op/tile/tile_ref.c
@@ -37,7 +37,6 @@
 
 #include <math.h>
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -52,26 +51,25 @@ static int ref_tile_fp32(float* data, float* output, int* repeat, int* inDim, in
 {
     int index = 0;
 
-    if(flag == 0)   // caffe
+    if (flag == 0) // caffe
     {
-        for(int in = 0; in < inDim[0]; in++)
+        for (int in = 0; in < inDim[0]; in++)
         {
-            for(int rn = 0; rn < repeat[3]; rn++)
+            for (int rn = 0; rn < repeat[3]; rn++)
             {
-                for(int ic = 0; ic < inDim[1]; ic++)
+                for (int ic = 0; ic < inDim[1]; ic++)
                 {
-                    for(int rc = 0; rc < repeat[2]; rc++)
+                    for (int rc = 0; rc < repeat[2]; rc++)
                     {
-                        for(int ih = 0; ih < inDim[2]; ih++)
+                        for (int ih = 0; ih < inDim[2]; ih++)
                         {
-                            for(int rh = 0; rh < repeat[1]; rh++)
+                            for (int rh = 0; rh < repeat[1]; rh++)
                             {
-                                for(int iw = 0; iw < inDim[3]; iw++)
+                                for (int iw = 0; iw < inDim[3]; iw++)
                                 {
-                                    for(int rw = 0; rw < repeat[0]; rw++)
+                                    for (int rw = 0; rw < repeat[0]; rw++)
                                     {
-                                        int inDataSize = in * inDim[1] * inDim[2] * inDim[3] + ic * inDim[2] * inDim[3] +
-                                                         ih * inDim[3] + iw;
+                                        int inDataSize = in * inDim[1] * inDim[2] * inDim[3] + ic * inDim[2] * inDim[3] + ih * inDim[3] + iw;
                                         output[index] = data[inDataSize];
                                         index++;
                                     }
@@ -83,7 +81,7 @@ static int ref_tile_fp32(float* data, float* output, int* repeat, int* inDim, in
             }
         }
     }
-    else if(flag == 1)  // onnx
+    else if (flag == 1) // onnx
     {
         int n = inDim[0];
         int c = inDim[1];
@@ -94,15 +92,15 @@ static int ref_tile_fp32(float* data, float* output, int* repeat, int* inDim, in
         int rh = repeat[1];
         int rw = repeat[0];
 
-        int n1 = n*rn;
-        int c1 = c*rc;
-        int h1 = h*rh;
-        int w1 = w*rw;
+        int n1 = n * rn;
+        int c1 = c * rc;
+        int h1 = h * rh;
+        int w1 = w * rw;
 
-        int size = outDim[0]*outDim[1]*outDim[2]*outDim[3];
+        int size = outDim[0] * outDim[1] * outDim[2] * outDim[3];
         for (int i = 0; i < size; ++i)
         {
-            index = i / (c1*h1*w1) % n * (c*h*w) + i % (c1*h1*w1) / (h1*w1) % c * (h*w) + i % (h1*w1) / w1 % h * w + i % w1 % w;
+            index = i / (c1 * h1 * w1) % n * (c * h * w) + i % (c1 * h1 * w1) / (h1 * w1) % c * (h * w) + i % (h1 * w1) / w1 % h * w + i % w1 % w;
             output[i] = data[index];
         }
     }
@@ -135,29 +133,29 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int size = 0;
     int default_value = 1;
 
-    if(frame_flag == 0)
+    if (frame_flag == 0)
     {
         size = param->reps_size;
-        for(int i = 0; i < 4 - size; i++)
+        for (int i = 0; i < 4 - size; i++)
         {
             push_vector_data(repeat, (void*)&default_value);
         }
     }
-    else if ( frame_flag == 1)
+    else if (frame_flag == 1)
     {
-        size = input_reps_shape[0]*input_reps_shape[1]*input_reps_shape[2]*input_reps_shape[3];
-        for(int i = 0; i < size; i++)
+        size = input_reps_shape[0] * input_reps_shape[1] * input_reps_shape[2] * input_reps_shape[3];
+        for (int i = 0; i < size; i++)
         {
             push_vector_data(repeat, (void*)&input_reps[i]);
         }
-        for(int i = 0; i < 4 - size; i++)
+        for (int i = 0; i < 4 - size; i++)
         {
             push_vector_data(repeat, (void*)&default_value);
         }
     }
 
-    int* repeat_data = (int*)sys_malloc(get_vector_num(repeat)*sizeof(int));
-    for(int i = 0; i < get_vector_num(repeat); i++)
+    int* repeat_data = (int*)sys_malloc(get_vector_num(repeat) * sizeof(int));
+    for (int i = 0; i < get_vector_num(repeat); i++)
     {
         int* a = (int*)get_vector_data(repeat, i);
         repeat_data[i] = *a;
@@ -176,15 +174,13 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
 }
 
 static struct node_ops hcl_node_ops = {
-        .prerun = prerun,
-        .run = run,
-        .reshape = NULL,
-        .postrun = NULL,
-        .init_node = init_node,
-        .release_node = release_node,
-        .score = score
-};
-
+    .prerun = prerun,
+    .run = run,
+    .reshape = NULL,
+    .postrun = NULL,
+    .init_node = init_node,
+    .release_node = release_node,
+    .score = score};
 
 int register_tile_ref_op()
 {
diff --git a/source/device/cpu/op/topkv2/topkv2_ref.c b/source/device/cpu/op/topkv2/topkv2_ref.c
index 73054192e..b84cc2433 100644
--- a/source/device/cpu/op/topkv2/topkv2_ref.c
+++ b/source/device/cpu/op/topkv2/topkv2_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 struct topkv2_param_ref
 {
     int k;
@@ -108,7 +107,7 @@ static int ref_topkv2_fp32(float* in_data, float* out_data, int* out_index, stru
 
     int row_size = param->row_size;
     int num_rows = param->num_rows;
-    int* index = ( int* )sys_malloc(row_size * sizeof(int));
+    int* index = (int*)sys_malloc(row_size * sizeof(int));
 
     for (int i = 0; i < num_rows; ++i)
     {
@@ -137,18 +136,18 @@ static int ref_topkv2_uint8(struct tensor* input_tensor, struct tensor* output_t
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* in_data = ( float* )sys_malloc(input_size * sizeof(float));
-    float* out_data = ( float* )sys_malloc(output_size * sizeof(float));
+    float* in_data = (float*)sys_malloc(input_size * sizeof(float));
+    float* out_data = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        in_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        in_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     int k = param->k;
     int row_size = param->row_size;
     int num_rows = param->num_rows;
-    int* index = ( int* )sys_malloc(row_size * sizeof(int));
+    int* index = (int*)sys_malloc(row_size * sizeof(int));
 
     for (int i = 0; i < num_rows; ++i)
     {
@@ -162,7 +161,7 @@ static int ref_topkv2_uint8(struct tensor* input_tensor, struct tensor* output_t
         memcpy(&out_index[i * k], index, k * sizeof(float));
         sys_free(index);
     }
-    
+
     /* quant */
     for (int i = 0; i < output_size; i++)
     {
@@ -199,10 +198,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 {
     struct node* ir_node = exec_node->ir_node;
     struct graph* ir_graph = ir_node->graph;
-    struct topkv2_param* _param = ( struct topkv2_param* )(ir_node->op.param_mem);
+    struct topkv2_param* _param = (struct topkv2_param*)(ir_node->op.param_mem);
     struct tensor* input_tensor;
     int out_nums = ir_node->output_num;
-    struct topkv2_priv_info* topkv2_priv_info = ( struct topkv2_priv_info* )exec_node->ops_priv;
+    struct topkv2_priv_info* topkv2_priv_info = (struct topkv2_priv_info*)exec_node->ops_priv;
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
     struct tensor* output_tensor_1 = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[1]);
@@ -216,13 +215,13 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     op_param.k = _param->k;
     op_param.row_size = input_tensor->dims[dims_len - 1];
     op_param.num_rows = num_rows;
-    float* input = ( float* )input_tensor->data;
-    
+    float* input = (float*)input_tensor->data;
+
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
-        ret = ref_topkv2_fp32(input, ( float* )output_tensor->data, ( int* )output_tensor_1->data, &op_param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
-        ret = ref_topkv2_uint8(input_tensor, output_tensor, ( int* )output_tensor_1->data, &op_param);
+        ret = ref_topkv2_fp32(input, (float*)output_tensor->data, (int*)output_tensor_1->data, &op_param);
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
+        ret = ref_topkv2_uint8(input_tensor, output_tensor, (int*)output_tensor_1->data, &op_param);
 
     return ret;
 }
diff --git a/source/device/cpu/op/transpose/transpose_ref.c b/source/device/cpu/op/transpose/transpose_ref.c
index 2b030e3ab..ec14fd38f 100644
--- a/source/device/cpu/op/transpose/transpose_ref.c
+++ b/source/device/cpu/op/transpose/transpose_ref.c
@@ -37,7 +37,6 @@
 #include <math.h>
 #include <string.h>
 
-
 struct ref_transpose_param
 {
     int* in_dims;
@@ -58,9 +57,9 @@ void transpose2d(float* input, float* output, const struct ref_transpose_param*
     int stride1 = inStride[param->permute[1]];
 
     for (int n = 0; n < out_dim0; n++)
-    {    // 1
+    { // 1
         for (int h = 0; h < out_dim1; h++)
-        {    // 1
+        { // 1
             output[n * out_dim1 + h] = input[n * stride0 + h * stride1];
         }
     }
@@ -88,11 +87,11 @@ void transpose3d(float* input, float* output, const struct ref_transpose_param*
     int stride2 = inStride[param->permute[2]];
 
     for (int n = 0; n < out_dim0; n++)
-    {    // 1
+    { // 1
         for (int h = 0; h < out_dim1; h++)
-        {    // 1
+        { // 1
             for (int w = 0; w < out_dim2; w++)
-            {    // 2
+            { // 2
                 output[n * outStride0 + h * outStride1 + w] = input[n * stride0 + h * stride1 + w * stride2];
             }
         }
@@ -127,15 +126,14 @@ void transpose4d(float* input, float* output, const struct ref_transpose_param*
     int stride3 = inStride[param->permute[3]];
 
     for (int n = 0; n < out_dim0; n++)
-    {    // 1
+    { // 1
         for (int h = 0; h < out_dim1; h++)
-        {    // 1
+        { // 1
             for (int w = 0; w < out_dim2; w++)
-            {    // 2
+            { // 2
                 for (int c = 0; c < out_dim3; c++)
-                {    // 2
-                    output[n * outStride0 + h * outStride1 + w * outStride2 + c] =
-                        input[n * stride0 + h * stride1 + w * stride2 + c * stride3];
+                { // 2
+                    output[n * outStride0 + h * outStride1 + w * outStride2 + c] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3];
                 }
             }
         }
@@ -173,17 +171,16 @@ void transpose5d(float* input, float* output, const struct ref_transpose_param*
     int stride4 = inStride[param->permute[4]];
 
     for (int n = 0; n < out_dim0; n++)
-    {    // 1
+    { // 1
         for (int h = 0; h < out_dim1; h++)
-        {    // 1
+        { // 1
             for (int w = 0; w < out_dim2; w++)
-            {    // 2
+            { // 2
                 for (int c = 0; c < out_dim3; c++)
-                {    // 2
+                { // 2
                     for (int x = 0; x < out_dim4; x++)
                     {
-                        output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x] =
-                            input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4];
+                        output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4];
                     }
                 }
             }
@@ -228,20 +225,18 @@ void transpose6d(float* input, float* output, const struct ref_transpose_param*
     int stride5 = inStride[param->permute[5]];
 
     for (int n = 0; n < out_dim0; n++)
-    {    // 1
+    { // 1
         for (int h = 0; h < out_dim1; h++)
-        {    // 1
+        { // 1
             for (int w = 0; w < out_dim2; w++)
-            {    // 2
+            { // 2
                 for (int c = 0; c < out_dim3; c++)
-                {    // 2
+                { // 2
                     for (int x = 0; x < out_dim4; x++)
                     {
                         for (int y = 0; y < out_dim5; y++)
                         {
-                            output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x * outStride4 +
-                                   y] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4 +
-                                              y * stride5];
+                            output[n * outStride0 + h * outStride1 + w * outStride2 + c * outStride3 + x * outStride4 + y] = input[n * stride0 + h * stride1 + w * stride2 + c * stride3 + x * stride4 + y * stride5];
                         }
                     }
                 }
@@ -254,23 +249,23 @@ static int ref_transpose_fp32(float* input, float* output, const struct ref_tran
 {
     switch (param->dims)
     {
-        case 2:
-            transpose2d(input, output, param);
-            break;
-        case 3:
-            transpose3d(input, output, param);
-            break;
-        case 4:
-            transpose4d(input, output, param);
-            break;
-        case 5:
-            transpose5d(input, output, param);
-            break;
-        case 6:
-            transpose6d(input, output, param);
-            break;
-        default:
-            break;
+    case 2:
+        transpose2d(input, output, param);
+        break;
+    case 3:
+        transpose3d(input, output, param);
+        break;
+    case 4:
+        transpose4d(input, output, param);
+        break;
+    case 5:
+        transpose5d(input, output, param);
+        break;
+    case 6:
+        transpose6d(input, output, param);
+        break;
+    default:
+        break;
     }
     return 0;
 }
@@ -287,33 +282,33 @@ static int ref_transpose_uint8(struct tensor* input_tensor, struct tensor* outpu
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input = ( float* )sys_malloc(input_size * sizeof(float));
-    float* output = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input = (float*)sys_malloc(input_size * sizeof(float));
+    float* output = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        input[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     switch (param->dims)
     {
-        case 2:
-            transpose2d(input, output, param);
-            break;
-        case 3:
-            transpose3d(input, output, param);
-            break;
-        case 4:
-            transpose4d(input, output, param);
-            break;
-        case 5:
-            transpose5d(input, output, param);
-            break;
-        case 6:
-            transpose6d(input, output, param);
-            break;
-        default:
-            break;
+    case 2:
+        transpose2d(input, output, param);
+        break;
+    case 3:
+        transpose3d(input, output, param);
+        break;
+    case 4:
+        transpose4d(input, output, param);
+        break;
+    case 5:
+        transpose5d(input, output, param);
+        break;
+    case 6:
+        transpose6d(input, output, param);
+        break;
+    default:
+        break;
     }
 
     /* quant */
@@ -328,15 +323,14 @@ static int ref_transpose_uint8(struct tensor* input_tensor, struct tensor* outpu
     }
 
     sys_free(input);
-    sys_free(output); 
+    sys_free(output);
 
     return 0;
 }
 
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ref_transpose_param* op_param =
-        ( struct ref_transpose_param* )sys_malloc(sizeof(struct ref_transpose_param));
+    struct ref_transpose_param* op_param = (struct ref_transpose_param*)sys_malloc(sizeof(struct ref_transpose_param));
     memset(op_param, 0, sizeof(struct ref_transpose_param));
     exec_node->ops_priv = op_param;
 
@@ -359,13 +353,13 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct ref_transpose_param* op_param = ( struct ref_transpose_param* )exec_node->ops_priv;
-    struct transpose_param* transpose_param = ( struct transpose_param* )ir_node->op.param_mem;
+    struct ref_transpose_param* op_param = (struct ref_transpose_param*)exec_node->ops_priv;
+    struct transpose_param* transpose_param = (struct transpose_param*)ir_node->op.param_mem;
     int tr_size = transpose_param->tr_shape_size;
     // int tr_size = 2 ;
-    op_param->permute = ( int* )sys_malloc(tr_size * sizeof(int));
+    op_param->permute = (int*)sys_malloc(tr_size * sizeof(int));
     op_param->dims = input_tensor->dim_num;
-    op_param->in_dims = ( int* )sys_malloc(op_param->dims * sizeof(int));
+    op_param->in_dims = (int*)sys_malloc(op_param->dims * sizeof(int));
 
     return 0;
 }
@@ -373,7 +367,7 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     struct node* ir_node = exec_node->ir_node;
-    struct ref_transpose_param* op_param = ( struct ref_transpose_param* )exec_node->ops_priv;
+    struct ref_transpose_param* op_param = (struct ref_transpose_param*)exec_node->ops_priv;
 
     sys_free(op_param->permute);
     sys_free(op_param->in_dims);
@@ -390,12 +384,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct transpose_param* transpose_param = ( struct transpose_param* )ir_node->op.param_mem;
+    struct transpose_param* transpose_param = (struct transpose_param*)ir_node->op.param_mem;
 
-    void* out_data = ( void* )output_tensor->data;
-    void* in_data = ( void* )input_tensor->data;
+    void* out_data = (void*)output_tensor->data;
+    void* in_data = (void*)input_tensor->data;
 
-    struct ref_transpose_param* op_param = ( struct ref_transpose_param* )exec_node->ops_priv;
+    struct ref_transpose_param* op_param = (struct ref_transpose_param*)exec_node->ops_priv;
 
     int tr_size = transpose_param->tr_shape_size;
 
@@ -404,7 +398,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
         op_param->permute[i] = transpose_param->tr_shape[i];
     }
 
-    for (int i = 0; i < ( int )op_param->dims; i++)
+    for (int i = 0; i < (int)op_param->dims; i++)
     {
         op_param->in_dims[i] = input_tensor->dims[i];
     }
@@ -412,8 +406,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_transpose_fp32((float*)in_data, (float*)out_data, op_param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
-		ret = ref_transpose_uint8(input_tensor, output_tensor, op_param);
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
+        ret = ref_transpose_uint8(input_tensor, output_tensor, op_param);
 
     return ret;
 }
diff --git a/source/device/cpu/op/unary/unary_kernel_ref.h b/source/device/cpu/op/unary/unary_kernel_ref.h
index 9b44a3cee..7520ef0a0 100644
--- a/source/device/cpu/op/unary/unary_kernel_ref.h
+++ b/source/device/cpu/op/unary/unary_kernel_ref.h
@@ -25,7 +25,6 @@
 #ifndef __UNARY_KERNEL_REF_H__
 #define __UNARY_KERNEL_REF_H__
 
-
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
diff --git a/source/device/cpu/op/unary/unary_kernel_ref_fp32.c b/source/device/cpu/op/unary/unary_kernel_ref_fp32.c
index 06b129e19..58e0a4c2f 100644
--- a/source/device/cpu/op/unary/unary_kernel_ref_fp32.c
+++ b/source/device/cpu/op/unary/unary_kernel_ref_fp32.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_unary_fp32(struct tensor* input_tensor, struct tensor* output_tensor, struct unary_param* param)
 {
     float* in_data = (float*)input_tensor->data;
@@ -49,110 +48,110 @@ int ref_unary_fp32(struct tensor* input_tensor, struct tensor* output_tensor, st
 
     switch (type)
     {
-        case 0:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = fabs(in_data[i]);
-            }
-            break;
-        case 1:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = -(in_data[i]);
-            }
-            break;
-        case 2:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = floor(in_data[i]);
-            }
-            break;
-        case 3:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = ceil(in_data[i]);
-            }
-            break;
-        case 4:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = in_data[i] * in_data[i];
-            }
-            break;
-        case 5:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = sqrt(in_data[i]);
-            }
-            break;
-        case 6:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = 1.f / sqrt(in_data[i]);
-            }
-            break;
-        case 7:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = exp(in_data[i]);
-            }
-            break;
-        case 8:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = log(in_data[i]);
-            }
-            break;
-        case 9:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = sin(in_data[i]);
-            }
-            break;
-        case 10:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = cos(in_data[i]);
-            }
-            break;
-        case 11:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = tan(in_data[i]);
-            }
-            break;
-        case 12:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = asin(in_data[i]);
-            }
-            break;
-        case 13:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = acos(in_data[i]);
-            }
-            break;
-        case 14:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = atan(in_data[i]);
-            }
-            break;
-        case 15:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = 1.f / (in_data[i]);
-            }
-            break;
-        case 16:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = tanh(in_data[i]);
-            }
-            break;
-        default:
-            break;
+    case 0:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = fabs(in_data[i]);
+        }
+        break;
+    case 1:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = -(in_data[i]);
+        }
+        break;
+    case 2:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = floor(in_data[i]);
+        }
+        break;
+    case 3:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = ceil(in_data[i]);
+        }
+        break;
+    case 4:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = in_data[i] * in_data[i];
+        }
+        break;
+    case 5:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = sqrt(in_data[i]);
+        }
+        break;
+    case 6:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = 1.f / sqrt(in_data[i]);
+        }
+        break;
+    case 7:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = exp(in_data[i]);
+        }
+        break;
+    case 8:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = log(in_data[i]);
+        }
+        break;
+    case 9:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = sin(in_data[i]);
+        }
+        break;
+    case 10:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = cos(in_data[i]);
+        }
+        break;
+    case 11:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = tan(in_data[i]);
+        }
+        break;
+    case 12:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = asin(in_data[i]);
+        }
+        break;
+    case 13:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = acos(in_data[i]);
+        }
+        break;
+    case 14:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = atan(in_data[i]);
+        }
+        break;
+    case 15:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = 1.f / (in_data[i]);
+        }
+        break;
+    case 16:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = tanh(in_data[i]);
+        }
+        break;
+    default:
+        break;
     }
 
     return 0;
diff --git a/source/device/cpu/op/unary/unary_kernel_ref_uint8.c b/source/device/cpu/op/unary/unary_kernel_ref_uint8.c
index 98d04b637..cb2b0957c 100644
--- a/source/device/cpu/op/unary/unary_kernel_ref_uint8.c
+++ b/source/device/cpu/op/unary/unary_kernel_ref_uint8.c
@@ -38,7 +38,6 @@
 
 #include <math.h>
 
-
 int ref_unary_uint8(struct tensor* input_tensor, struct tensor* output_tensor, struct unary_param* param)
 {
     /* dequant */
@@ -51,12 +50,12 @@ int ref_unary_uint8(struct tensor* input_tensor, struct tensor* output_tensor, s
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* in_data = ( float* )sys_malloc(input_size * sizeof(float));
-    float* out_data = ( float* )sys_malloc(output_size * sizeof(float));
+    float* in_data = (float*)sys_malloc(input_size * sizeof(float));
+    float* out_data = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        in_data[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        in_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     int size = input_tensor->elem_num;
@@ -65,110 +64,110 @@ int ref_unary_uint8(struct tensor* input_tensor, struct tensor* output_tensor, s
 
     switch (type)
     {
-        case 0:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = fabs(in_data[i]);
-            }
-            break;
-        case 1:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = -(in_data[i]);
-            }
-            break;
-        case 2:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = floor(in_data[i]);
-            }
-            break;
-        case 3:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = ceil(in_data[i]);
-            }
-            break;
-        case 4:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = in_data[i] * in_data[i];
-            }
-            break;
-        case 5:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = sqrt(in_data[i]);
-            }
-            break;
-        case 6:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = 1.f / sqrt(in_data[i]);
-            }
-            break;
-        case 7:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = exp(in_data[i]);
-            }
-            break;
-        case 8:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = log(in_data[i]);
-            }
-            break;
-        case 9:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = sin(in_data[i]);
-            }
-            break;
-        case 10:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = cos(in_data[i]);
-            }
-            break;
-        case 11:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = tan(in_data[i]);
-            }
-            break;
-        case 12:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = asin(in_data[i]);
-            }
-            break;
-        case 13:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = acos(in_data[i]);
-            }
-            break;
-        case 14:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = atan(in_data[i]);
-            }
-            break;
-        case 15:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = 1.f / (in_data[i]);
-            }
-            break;
-        case 16:
-            for (int i = 0; i < size; i++)
-            {
-                out_data[i] = tanh(in_data[i]);
-            }
-            break;
-        default:
-            break;
+    case 0:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = fabs(in_data[i]);
+        }
+        break;
+    case 1:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = -(in_data[i]);
+        }
+        break;
+    case 2:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = floor(in_data[i]);
+        }
+        break;
+    case 3:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = ceil(in_data[i]);
+        }
+        break;
+    case 4:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = in_data[i] * in_data[i];
+        }
+        break;
+    case 5:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = sqrt(in_data[i]);
+        }
+        break;
+    case 6:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = 1.f / sqrt(in_data[i]);
+        }
+        break;
+    case 7:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = exp(in_data[i]);
+        }
+        break;
+    case 8:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = log(in_data[i]);
+        }
+        break;
+    case 9:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = sin(in_data[i]);
+        }
+        break;
+    case 10:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = cos(in_data[i]);
+        }
+        break;
+    case 11:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = tan(in_data[i]);
+        }
+        break;
+    case 12:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = asin(in_data[i]);
+        }
+        break;
+    case 13:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = acos(in_data[i]);
+        }
+        break;
+    case 14:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = atan(in_data[i]);
+        }
+        break;
+    case 15:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = 1.f / (in_data[i]);
+        }
+        break;
+    case 16:
+        for (int i = 0; i < size; i++)
+        {
+            out_data[i] = tanh(in_data[i]);
+        }
+        break;
+    default:
+        break;
     }
 
     /* quant */
diff --git a/source/device/cpu/op/unary/unary_ref.c b/source/device/cpu/op/unary/unary_ref.c
index 915c69bbf..0f9610a2e 100644
--- a/source/device/cpu/op/unary/unary_ref.c
+++ b/source/device/cpu/op/unary/unary_ref.c
@@ -36,7 +36,6 @@
 
 #include "unary_kernel_ref.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -54,15 +53,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-    struct unary_param* unary_param = ( struct unary_param* )ir_node->op.param_mem;
+    struct unary_param* unary_param = (struct unary_param*)ir_node->op.param_mem;
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_unary_fp32(input_tensor, output_tensor, unary_param);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_unary_uint8(input_tensor, output_tensor, unary_param);
     else
-        TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);        
+        TLOG_ERR("Input data type %d not to be supported.\n", input_tensor->data_type);
 
     return ret;
 }
diff --git a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c
index 2fcb30b0d..70847a7d9 100644
--- a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c
+++ b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 int ref_unsqueeze_fp32(struct tensor* input_tensor, struct tensor* output_tensor)
 {
     float* input_data = (float*)input_tensor->data;
@@ -80,10 +79,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
 
-	int ret = -1;
+    int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_unsqueeze_fp32(input_tensor, output_tensor);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_unsqueeze_uint8(input_tensor, output_tensor);
 
     return ret;
diff --git a/source/device/cpu/op/upsample/upsample_ref.c b/source/device/cpu/op/upsample/upsample_ref.c
index d6aa8d7e8..23ea6ff99 100644
--- a/source/device/cpu/op/upsample/upsample_ref.c
+++ b/source/device/cpu/op/upsample/upsample_ref.c
@@ -36,7 +36,6 @@
 
 #include <math.h>
 
-
 static int ref_upsample_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
                              struct upsample_param* param, int num_thread)
 {
@@ -96,12 +95,12 @@ static int ref_upsample_uint8(struct tensor* input_tensor, struct tensor* output
     int input_size = input_tensor->elem_num;
     int output_size = output_tensor->elem_num;
 
-    float* input_fp32 = ( float* )sys_malloc(input_size * sizeof(float));
-    float* output_fp32 = ( float* )sys_malloc(output_size * sizeof(float));
+    float* input_fp32 = (float*)sys_malloc(input_size * sizeof(float));
+    float* output_fp32 = (float*)sys_malloc(output_size * sizeof(float));
 
     for (int i = 0; i < input_size; i++)
     {
-        input_fp32[i] = (( float )input_uint8[i] - ( float )input_zero) * input_scale;
+        input_fp32[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
     }
 
     /* fp32 inference */
@@ -160,7 +159,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
     output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    struct upsample_param* upsample_param = ( struct upsample_param* )ir_node->op.param_mem;
+    struct upsample_param* upsample_param = (struct upsample_param*)ir_node->op.param_mem;
 
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
diff --git a/source/device/cpu/op/where/where_ref.c b/source/device/cpu/op/where/where_ref.c
index 9a6fd7fe8..52a2fd778 100644
--- a/source/device/cpu/op/where/where_ref.c
+++ b/source/device/cpu/op/where/where_ref.c
@@ -32,7 +32,6 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"
 
-
 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
     return 0;
@@ -44,7 +43,7 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
 }
 static int ref_where_fp32(float* condition, float* data_a, float* data_b, float* output, int size)
 {
-    for(int i = 0; i < size; i++)
+    for (int i = 0; i < size; i++)
     {
         output[i] = condition[i] ? data_a[i] : data_b[i];
     }
@@ -63,19 +62,21 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 
     struct tensor* input_tensor_a = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
     struct tensor* input_tensor_b = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
-    
+
     int elem_num_condition = input_tensor->elem_num;
     int elem_num_a = input_tensor_a->elem_num;
     int elem_num_b = input_tensor_b->elem_num;
 
-    if(elem_num_condition != elem_num_a || elem_num_condition != elem_num_b){
+    if (elem_num_condition != elem_num_a || elem_num_condition != elem_num_b)
+    {
         TLOG_ERR("Tensor size is not equal\n");
         return -1;
     }
 
-    int ret = ref_where_fp32((float*)input_tensor->data, (float*)input_tensor_a->data, 
-        (float*)input_tensor_b->data, (float*)output_tensor->data, elem_num_a);
-    if(ret < -1){
+    int ret = ref_where_fp32((float*)input_tensor->data, (float*)input_tensor_a->data,
+                             (float*)input_tensor_b->data, (float*)output_tensor->data, elem_num_a);
+    if (ret < -1)
+    {
         TLOG_ERR("where operator execution error\n");
         return -1;
     }
diff --git a/source/device/cpu/op/zeroslike/zeroslike_ref.c b/source/device/cpu/op/zeroslike/zeroslike_ref.c
index fd8ebf2f9..47b83d417 100644
--- a/source/device/cpu/op/zeroslike/zeroslike_ref.c
+++ b/source/device/cpu/op/zeroslike/zeroslike_ref.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 int ref_zeroslike_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread)
 {
     // dims size = 2 or 3
@@ -157,7 +156,7 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
     int ret = -1;
     if (input_tensor->data_type == TENGINE_DT_FP32)
         ret = ref_zeroslike_fp32(input_tensor, output_tensor, exec_graph->num_thread);
-    else if(input_tensor->data_type == TENGINE_DT_UINT8)
+    else if (input_tensor->data_type == TENGINE_DT_UINT8)
         ret = ref_zeroslike_uint8(input_tensor, output_tensor, exec_graph->num_thread);
 
     return ret;
diff --git a/source/device/cuda/cuda_device.hpp b/source/device/cuda/cuda_device.hpp
index af3359888..bbadedb17 100644
--- a/source/device/cuda/cuda_device.hpp
+++ b/source/device/cuda/cuda_device.hpp
@@ -28,8 +28,7 @@
 
 #define CUDA_DEV_NAME "CUDA"
 
-extern "C"
-{
+extern "C" {
 struct cuda_device
 {
     struct device base;
diff --git a/source/device/cuda/cuda_executor.hpp b/source/device/cuda/cuda_executor.hpp
index 410b223e8..15b6afd2b 100644
--- a/source/device/cuda/cuda_executor.hpp
+++ b/source/device/cuda/cuda_executor.hpp
@@ -28,8 +28,7 @@
 #include <vector>
 #include <functional>
 
-extern "C"
-{
+extern "C" {
 #include "graph/node.h"
 #include "graph/graph.h"
 #include "graph/subgraph.h"
@@ -42,7 +41,7 @@ extern "C"
 
 typedef std::map<uint32_t, uint32_t> dict_uint2uint;
 typedef std::map<uint32_t, void*> dict_uint2voidx;
-typedef std::function< void() >  GPU_kernel;
+typedef std::function<void()> GPU_kernel;
 
 class CUDAEngine
 {
@@ -84,5 +83,5 @@ class CUDAEngine
     cudnnConvolutionFwdAlgo_t algo1;
 
 public:
-    dict_uint2voidx     gpu_addr_map;
+    dict_uint2voidx gpu_addr_map;
 };
diff --git a/source/device/cuda/cuda_graph.hpp b/source/device/cuda/cuda_graph.hpp
index 1eaa3a230..72764181e 100644
--- a/source/device/cuda/cuda_graph.hpp
+++ b/source/device/cuda/cuda_graph.hpp
@@ -24,12 +24,10 @@
 
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "device/device.h"
 #include "graph/subgraph.h"
 
-
 int cuda_dev_init(struct device* dev);
 int cuda_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options);
 int cuda_dev_run(struct device* dev, struct subgraph* subgraph);
diff --git a/source/device/cuda/cuda_limit.hpp b/source/device/cuda/cuda_limit.hpp
index d9c34fc5d..02d89a264 100644
--- a/source/device/cuda/cuda_limit.hpp
+++ b/source/device/cuda/cuda_limit.hpp
@@ -34,26 +34,23 @@
 
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "operator/op.h"
 }
 
-
 const int cuda_supported_ops[] = {
-        OP_CLIP,
-        OP_CONCAT,
-        OP_CONST,
-        OP_CONV,
-        OP_DROPOUT,
-        OP_ELTWISE,
-        OP_FC,
-        OP_FLATTEN,
-        OP_INPUT,
-        OP_PERMUTE,
-        OP_POOL,
-        OP_RELU,
-        OP_RESHAPE,
-        OP_SLICE,
-        OP_SOFTMAX
-};
+    OP_CLIP,
+    OP_CONCAT,
+    OP_CONST,
+    OP_CONV,
+    OP_DROPOUT,
+    OP_ELTWISE,
+    OP_FC,
+    OP_FLATTEN,
+    OP_INPUT,
+    OP_PERMUTE,
+    OP_POOL,
+    OP_RELU,
+    OP_RESHAPE,
+    OP_SLICE,
+    OP_SOFTMAX};
diff --git a/source/device/device.c b/source/device/device.c
index 0ae392e49..c43bf4534 100644
--- a/source/device/device.c
+++ b/source/device/device.c
@@ -28,7 +28,6 @@
 
 #include <string.h>
 
-
 void init_ir_device(ir_device_t* device, const char* name)
 {
     if (NULL != name)
@@ -45,10 +44,9 @@ void init_ir_device(ir_device_t* device, const char* name)
     device->allocator = NULL;
     device->optimizer = NULL;
     device->scheduler = NULL;
-    device->privacy   = NULL;
+    device->privacy = NULL;
 }
 
-
 int get_device_option_size(ir_device_t* device)
 {
     // TODO: need an impl
diff --git a/source/device/device.h b/source/device/device.h
index 0c82cca12..f67f0d8ab 100644
--- a/source/device/device.h
+++ b/source/device/device.h
@@ -33,7 +33,6 @@ struct vector;
 
 #include <stddef.h>
 
-
 /*!
  * @struct ir_interface_t
  * @brief  Abstract neural network runnable device interface struct
@@ -65,7 +64,6 @@ typedef struct interface
     int (*release_device)(struct device* device);
 } ir_interface_t;
 
-
 /*!
  * @struct ir_allocator_t
  * @brief  Abstract neural network runnable device allocator struct
@@ -85,18 +83,16 @@ typedef struct allocator
     int (*release)(struct device*, struct subgraph*);
 } ir_allocator_t;
 
-
 /*!
  * @struct ir_optimizer_t
  * @brief  Abstract neural network runnable device expend optimizer
  */
 typedef struct optimizer
 {
-    int (*split_graph)(struct graph* ir_graph);                    //!< interface of split graph delegation
-    int (*optimize_graph)(struct graph* ir_graph, int precision);  //!< interface of optimizing graph delegation
+    int (*split_graph)(struct graph* ir_graph);                   //!< interface of split graph delegation
+    int (*optimize_graph)(struct graph* ir_graph, int precision); //!< interface of optimizing graph delegation
 } ir_optimizer_t;
 
-
 /*!
  * @struct nn_device_t
  * @brief  Abstract neural network runnable device description struct
@@ -104,14 +100,13 @@ typedef struct optimizer
 typedef struct device
 {
     const char* name;
-    struct interface* interface;      //!< device scheduler operation interface
-    struct allocator* allocator;      //!< device allocation operation interface
-    struct optimizer* optimizer;      //!< device optimizer operation interface
-    struct scheduler* scheduler;      //!< device scheduler
-    void*  privacy;                   //!< device privacy data
+    struct interface* interface; //!< device scheduler operation interface
+    struct allocator* allocator; //!< device allocation operation interface
+    struct optimizer* optimizer; //!< device optimizer operation interface
+    struct scheduler* scheduler; //!< device scheduler
+    void* privacy;               //!< device privacy data
 } ir_device_t;
 
-
 /*!
  * @brief  Initialize a device.
  *
@@ -122,7 +117,6 @@ typedef struct device
  */
 void init_ir_device(ir_device_t* device, const char* name);
 
-
 /*!
  * @brief  Size of a device option struct.
  *
diff --git a/source/device/opencl/ocl_define.h b/source/device/opencl/ocl_define.h
index 48a28a63f..010fc2651 100644
--- a/source/device/opencl/ocl_define.h
+++ b/source/device/opencl/ocl_define.h
@@ -26,9 +26,8 @@
 
 #define OCL_DEV_NAME "OCL"
 
-
 typedef struct ocl_option
 {
     char* dev_name;
-    int precision;      //!< precision of calculation
+    int precision; //!< precision of calculation
 } ocl_opt_t;
diff --git a/source/device/opencl/ocl_device.hpp b/source/device/opencl/ocl_device.hpp
index b76608f04..3ced1cb81 100644
--- a/source/device/opencl/ocl_device.hpp
+++ b/source/device/opencl/ocl_device.hpp
@@ -26,8 +26,7 @@
 
 #include "ocl_define.h"
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 
diff --git a/source/device/opencl/ocl_executor.hpp b/source/device/opencl/ocl_executor.hpp
index 5b5434b2c..9649a1d99 100644
--- a/source/device/opencl/ocl_executor.hpp
+++ b/source/device/opencl/ocl_executor.hpp
@@ -22,9 +22,7 @@
  * Author: lswang@openailab.com
  */
 
-
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 #include "graph/tensor.h"
@@ -57,15 +55,15 @@ struct OCLqueue
     int dims;
     cl_kernel queue_kernel;
     cl_event enentPoint;
-    size_t *queue_global_work_size;
-    size_t *queue_local_work_size;
+    size_t* queue_global_work_size;
+    size_t* queue_local_work_size;
 };
 
 class OCLEngine
 {
 public:
-//    OCLEngine();
-//    ~OCLEngine() = default;
+    //    OCLEngine();
+    //    ~OCLEngine() = default;
 
     int OCLEnginePreRun(struct subgraph* subgraph);
     int OCLEngineRun(struct subgraph* subgraph);
@@ -73,12 +71,11 @@ class OCLEngine
 
 private:
     bool init();
-    bool build_kernel(const char *filename, const char *kernel_name);
+    bool build_kernel(const char* filename, const char* kernel_name);
     bool OCLTensorMap(struct graph* ir_graph, int ir_tensor_idx, cl_mem_flags flag);
     int BuildTensor(struct subgraph* subgraph);
     int BuildKernel(struct subgraph* subgraph);
 
-
     bool AddClipNode(struct node* ir_node);
     bool AddConcatNode(struct node* ir_node);
     bool AddConvolutionNode(struct node* ir_node);
@@ -91,11 +88,10 @@ class OCLEngine
     bool AddReshapeNode(struct node* ir_node);
     bool AddSliceNode(struct node* ir_node);
 
-
 private:
-    cl_int    status;
+    cl_int status;
     cl_platform_id platform;
-    cl_device_id *devices;
+    cl_device_id* devices;
     cl_context context;
     cl_command_queue commandQueue;
 
@@ -103,13 +99,9 @@ class OCLEngine
     cl_kernel kernel;
 
 public:
-    dict_uint2clmem             ocl_tensor_map;
-    std::vector<struct OCLqueue>    queue_list;
+    dict_uint2clmem ocl_tensor_map;
+    std::vector<struct OCLqueue> queue_list;
 
 public:
     int bin_num;
-
 };
-
-
-
diff --git a/source/device/opencl/ocl_graph.hpp b/source/device/opencl/ocl_graph.hpp
index c5531f08c..6eed3d0de 100644
--- a/source/device/opencl/ocl_graph.hpp
+++ b/source/device/opencl/ocl_graph.hpp
@@ -24,12 +24,10 @@
 
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "device/device.h"
 #include "graph/subgraph.h"
 
-
 int ocl_dev_init(struct device* dev);
 int ocl_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options);
 int ocl_dev_run(struct device* dev, struct subgraph* subgraph);
diff --git a/source/device/opencl/ocl_helper.hpp b/source/device/opencl/ocl_helper.hpp
index 5fabe377d..e6f556b52 100644
--- a/source/device/opencl/ocl_helper.hpp
+++ b/source/device/opencl/ocl_helper.hpp
@@ -32,8 +32,7 @@
 #include <string>
 #include <fstream>
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "graph/tensor.h"
 #include "graph/node.h"
@@ -49,15 +48,14 @@ bool CHECK_ENQUEUE_KERNEL_STATUS(cl_int status);
 bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status);
 
 /** convert the kernel file into a string */
-int convertToString(const char *filename, std::string& s);
+int convertToString(const char* filename, std::string& s);
 
 /**Getting platforms and choose an available one.*/
-int getPlatform(cl_platform_id &platform);
+int getPlatform(cl_platform_id& platform);
 
 /**Step 2:Query the platform and choose the first GPU device if has one.*/
-cl_device_id *getCl_device_id(cl_platform_id &platform);
+cl_device_id* getCl_device_id(cl_platform_id& platform);
 
 void get_device_message();
 
 void dump_sub_graph(struct subgraph* sub_graph);
-
diff --git a/source/device/opencl/ocl_limit.hpp b/source/device/opencl/ocl_limit.hpp
index f319c1dea..da6c45a7e 100644
--- a/source/device/opencl/ocl_limit.hpp
+++ b/source/device/opencl/ocl_limit.hpp
@@ -22,139 +22,134 @@
  * Author: hhchen@openailab.com
  */
 
-
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "operator/op.h"
 }
 
-
 const int ocl_supported_ops[] = {
 
-        OP_CLIP,
-        OP_CONCAT,
-        OP_CONST,
-        OP_CONV,
-        OP_DROPOUT,
-        OP_ELTWISE,
-        OP_FC,
-        OP_FLATTEN,
-        OP_INPUT,
-////        OP_PERMUTE,
-        OP_POOL,
-        OP_RELU,
-        OP_RESHAPE,
-        OP_SLICE,
-////        OP_SOFTMAX
-
-
-//        OP_BIAS,
+    OP_CLIP,
+    OP_CONCAT,
+    OP_CONST,
+    OP_CONV,
+    OP_DROPOUT,
+    OP_ELTWISE,
+    OP_FC,
+    OP_FLATTEN,
+    OP_INPUT,
+    ////        OP_PERMUTE,
+    OP_POOL,
+    OP_RELU,
+    OP_RESHAPE,
+    OP_SLICE,
+    ////        OP_SOFTMAX
 
-////        OP_ABSVAL,
-////        OP_ADD_N,
-////        OP_ARGMAX,
-////        OP_ARGMIN,
-////        OP_BATCHNORM,
-////        OP_BATCHTOSPACEND,
-////        OP_BIAS,
-////        OP_BROADMUL,
-//
-////        OP_CAST,
-////        OP_CEIL,
-////        OP_CLIP,
-////        OP_COMPARISON,
-////        OP_CONCAT,
-//        OP_CONST,
-//        OP_CONV,
-////        OP_CROP,
-////        OP_DECONV,
-////        OP_DEPTHTOSPACE,
-////        OP_DETECTION_OUTPUT,
-////        OP_DETECTION_POSTPROCESS,
-//
-////        OP_DROPOUT,
-////        OP_ELTWISE,
-////        OP_ELU,
-////        OP_EMBEDDING,
-////        OP_EXPANDDIMS,
-////        OP_FC,
-////        OP_FLATTEN,
-////        OP_GATHER,
-////        OP_GEMM,
-////        OP_GRU,
-////        OP_HARDSIGMOID,
-////        OP_HARDSWISH,
-//        OP_INPUT,
-////        OP_INSTANCENORM,
-////        OP_INTERP,
-////        OP_LOGICAL,
-////        OP_LOGISTIC,
-////        OP_LRN,
-////        OP_LSTM,
-////        OP_MATMUL,
-////        OP_MAXIMUM,
-////        OP_MEAN,
-////        OP_MINIMUM,
-////        OP_MVN,
-////        OP_NOOP,
-////        OP_NORMALIZE,
-//
-////        OP_PAD,
-////        OP_PERMUTE,
-//        OP_POOL,
-////        OP_PRELU,
-////        OP_PRIORBOX,
-////        OP_PSROIPOOLING,
-////        OP_REDUCEL2,
-////        OP_REDUCTION,
-////        OP_REGION,
-//        OP_RELU,
-//
-////        OP_RELU6,
-////        OP_REORG,
-////        OP_RESHAPE,
-////        OP_RESIZE,
-////        OP_REVERSE,
-////        OP_RNN,
-////        OP_ROIALIGN,
-////        OP_ROIPOOLING,
-////        OP_ROUND,
-////        OP_RPN,
-////        OP_SCALE,
-////        OP_SELU,
-////        OP_SHUFFLECHANNEL,
-////        OP_SIGMOID,
-//
-////        OP_SLICE,
-////        OP_SOFTMAX,
-////        OP_SPACETOBATCHND,
-////        OP_SPACETODEPTH,
-////        OP_SPARSETODENSE,
-////        OP_SPLIT,
-////        OP_SQUAREDDIFFERENCE,
-////        OP_SQUEEZE,
-////        OP_STRIDED_SLICE,
-////        OP_SWAP_AXIS,
-////        OP_TANH,
-////        OP_THRESHOLD,
-////        OP_TOPKV2,
-////        OP_TRANSPOSE,
-////        OP_UNARY,
-////        OP_UNSQUEEZE,
-////        OP_UPSAMPLE,
-////        OP_ZEROSLIKE,
-////        OP_MISH,
-////        OP_LOGSOFTMAX,
-////        OP_RELU1,
-////        OP_L2NORMALIZATION,
-////        OP_L2POOL,
-////        OP_TILE,
-////        OP_SHAPE,
-////        OP_SCATTER,
-////        OP_WHERE,
-////        OP_BUILTIN_LAST
+    //        OP_BIAS,
 
+    ////        OP_ABSVAL,
+    ////        OP_ADD_N,
+    ////        OP_ARGMAX,
+    ////        OP_ARGMIN,
+    ////        OP_BATCHNORM,
+    ////        OP_BATCHTOSPACEND,
+    ////        OP_BIAS,
+    ////        OP_BROADMUL,
+    //
+    ////        OP_CAST,
+    ////        OP_CEIL,
+    ////        OP_CLIP,
+    ////        OP_COMPARISON,
+    ////        OP_CONCAT,
+    //        OP_CONST,
+    //        OP_CONV,
+    ////        OP_CROP,
+    ////        OP_DECONV,
+    ////        OP_DEPTHTOSPACE,
+    ////        OP_DETECTION_OUTPUT,
+    ////        OP_DETECTION_POSTPROCESS,
+    //
+    ////        OP_DROPOUT,
+    ////        OP_ELTWISE,
+    ////        OP_ELU,
+    ////        OP_EMBEDDING,
+    ////        OP_EXPANDDIMS,
+    ////        OP_FC,
+    ////        OP_FLATTEN,
+    ////        OP_GATHER,
+    ////        OP_GEMM,
+    ////        OP_GRU,
+    ////        OP_HARDSIGMOID,
+    ////        OP_HARDSWISH,
+    //        OP_INPUT,
+    ////        OP_INSTANCENORM,
+    ////        OP_INTERP,
+    ////        OP_LOGICAL,
+    ////        OP_LOGISTIC,
+    ////        OP_LRN,
+    ////        OP_LSTM,
+    ////        OP_MATMUL,
+    ////        OP_MAXIMUM,
+    ////        OP_MEAN,
+    ////        OP_MINIMUM,
+    ////        OP_MVN,
+    ////        OP_NOOP,
+    ////        OP_NORMALIZE,
+    //
+    ////        OP_PAD,
+    ////        OP_PERMUTE,
+    //        OP_POOL,
+    ////        OP_PRELU,
+    ////        OP_PRIORBOX,
+    ////        OP_PSROIPOOLING,
+    ////        OP_REDUCEL2,
+    ////        OP_REDUCTION,
+    ////        OP_REGION,
+    //        OP_RELU,
+    //
+    ////        OP_RELU6,
+    ////        OP_REORG,
+    ////        OP_RESHAPE,
+    ////        OP_RESIZE,
+    ////        OP_REVERSE,
+    ////        OP_RNN,
+    ////        OP_ROIALIGN,
+    ////        OP_ROIPOOLING,
+    ////        OP_ROUND,
+    ////        OP_RPN,
+    ////        OP_SCALE,
+    ////        OP_SELU,
+    ////        OP_SHUFFLECHANNEL,
+    ////        OP_SIGMOID,
+    //
+    ////        OP_SLICE,
+    ////        OP_SOFTMAX,
+    ////        OP_SPACETOBATCHND,
+    ////        OP_SPACETODEPTH,
+    ////        OP_SPARSETODENSE,
+    ////        OP_SPLIT,
+    ////        OP_SQUAREDDIFFERENCE,
+    ////        OP_SQUEEZE,
+    ////        OP_STRIDED_SLICE,
+    ////        OP_SWAP_AXIS,
+    ////        OP_TANH,
+    ////        OP_THRESHOLD,
+    ////        OP_TOPKV2,
+    ////        OP_TRANSPOSE,
+    ////        OP_UNARY,
+    ////        OP_UNSQUEEZE,
+    ////        OP_UPSAMPLE,
+    ////        OP_ZEROSLIKE,
+    ////        OP_MISH,
+    ////        OP_LOGSOFTMAX,
+    ////        OP_RELU1,
+    ////        OP_L2NORMALIZATION,
+    ////        OP_L2POOL,
+    ////        OP_TILE,
+    ////        OP_SHAPE,
+    ////        OP_SCATTER,
+    ////        OP_WHERE,
+    ////        OP_BUILTIN_LAST
 
 };
diff --git a/source/device/tensorrt/trt_define.h b/source/device/tensorrt/trt_define.h
index 93faa31d4..88fd302f1 100644
--- a/source/device/tensorrt/trt_define.h
+++ b/source/device/tensorrt/trt_define.h
@@ -24,16 +24,15 @@
 
 #pragma once
 
-#define TRT_DEVICE_NAME             "TensorRT"
-
-#define EXPORT_BEGIN                extern "C" {
-#define EXPORT_FINISH               }
+#define TRT_DEVICE_NAME "TensorRT"
 
+#define EXPORT_BEGIN  extern "C" {
+#define EXPORT_FINISH }
 
 typedef struct trt_option
 {
     char* dev_name;
-    int gpu_index;      //!< select which GPU to run graph
-    int dla_index;      //!< select to use NVIDIA DLA
-    int precision;      //!< precision of calculation
+    int gpu_index; //!< select which GPU to run graph
+    int dla_index; //!< select to use NVIDIA DLA
+    int precision; //!< precision of calculation
 } trt_opt_t;
diff --git a/source/device/tensorrt/trt_device.hpp b/source/device/tensorrt/trt_device.hpp
index d4cbd2873..8c2275049 100644
--- a/source/device/tensorrt/trt_device.hpp
+++ b/source/device/tensorrt/trt_device.hpp
@@ -30,13 +30,11 @@ EXPORT_BEGIN
 #include "api/c_api.h"
 #include "device/device.h"
 
-
 struct trt_device
 {
     struct device base;
 };
 
-
 DLLEXPORT int register_cpu_device(void);
 
 EXPORT_FINISH
diff --git a/source/device/tensorrt/trt_executor.hpp b/source/device/tensorrt/trt_executor.hpp
index 0954ed436..b7523f0a2 100644
--- a/source/device/tensorrt/trt_executor.hpp
+++ b/source/device/tensorrt/trt_executor.hpp
@@ -42,7 +42,6 @@ EXPORT_FINISH
 #include <map>
 #include <vector>
 
-
 class TensorRTEngine
 {
 public:
@@ -63,7 +62,7 @@ class TensorRTEngine
     int get_type(int mode, nvinfer1::DataType& type);
 
 private:
-    size_t   card_id;
+    size_t card_id;
     uint16_t tensor_swap_count;
 
     std::map<uint16_t, nvinfer1::ITensor*> tensor_real_map;
@@ -116,5 +115,5 @@ class TensorRTEngine
     nvinfer1::INetworkDefinition* network;
     nvinfer1::IBuilderConfig* config;
     nvinfer1::ICudaEngine* engine;
-    nvinfer1::IExecutionContext *context;
+    nvinfer1::IExecutionContext* context;
 };
diff --git a/source/device/tensorrt/trt_graph.hpp b/source/device/tensorrt/trt_graph.hpp
index 0ceb1f88f..7050eb79b 100644
--- a/source/device/tensorrt/trt_graph.hpp
+++ b/source/device/tensorrt/trt_graph.hpp
@@ -34,7 +34,6 @@ EXPORT_BEGIN
 #include "graph/subgraph.h"
 #include "device/device.h"
 
-
 int trt_dev_init(struct device* dev);
 int trt_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options);
 int trt_dev_run(struct device* dev, struct subgraph* subgraph);
diff --git a/source/device/tensorrt/trt_helper.hpp b/source/device/tensorrt/trt_helper.hpp
index 6e8640886..63fa3c35d 100644
--- a/source/device/tensorrt/trt_helper.hpp
+++ b/source/device/tensorrt/trt_helper.hpp
@@ -42,7 +42,6 @@
 #include <ostream>
 #include <string>
 
-
 #ifdef _MSC_VER
 #define FN_NAME __FUNCTION__
 #else
@@ -53,40 +52,53 @@
 #define ENABLE_DLA_API 1
 #endif
 
-#define CHECK(status)                                                       \
-    do                                                                      \
-    {                                                                       \
-        auto ret = (status);                                                \
-        if (ret != 0)                                                       \
-        {                                                                   \
-            Log(Loglevel, "TensorRT Engine",  "Cuda failure: %d", ret);     \
-            abort();                                                        \
-        }                                                                   \
+#define CHECK(status)                                                  \
+    do                                                                 \
+    {                                                                  \
+        auto ret = (status);                                           \
+        if (ret != 0)                                                  \
+        {                                                              \
+            Log(Loglevel, "TensorRT Engine", "Cuda failure: %d", ret); \
+            abort();                                                   \
+        }                                                              \
     } while (0)
 
-
 constexpr long double operator"" _GiB(long double val)
 {
     return val * (1 << 30);
 }
-constexpr long double operator"" _MiB(long double val) { return val * (1 << 20); }
-constexpr long double operator"" _KiB(long double val) { return val * (1 << 10); }
+constexpr long double operator"" _MiB(long double val)
+{
+    return val * (1 << 20);
+}
+constexpr long double operator"" _KiB(long double val)
+{
+    return val * (1 << 10);
+}
 
 // These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB.
 // Since the return type is signed, -1_GiB will work as expected.
-constexpr long long int operator"" _GiB(long long unsigned int val) { return val * (1 << 30); }
-constexpr long long int operator"" _MiB(long long unsigned int val) { return val * (1 << 20); }
-constexpr long long int operator"" _KiB(long long unsigned int val) { return val * (1 << 10); }
-
-
+constexpr long long int operator"" _GiB(long long unsigned int val)
+{
+    return val * (1 << 30);
+}
+constexpr long long int operator"" _MiB(long long unsigned int val)
+{
+    return val * (1 << 20);
+}
+constexpr long long int operator"" _KiB(long long unsigned int val)
+{
+    return val * (1 << 10);
+}
 
-class Logger :public nvinfer1::ILogger
+class Logger : public nvinfer1::ILogger
 {
 public:
     nvinfer1::ILogger::Severity severity_;
 
 public:
-    Logger(nvinfer1::ILogger::Severity severity = nvinfer1::ILogger::Severity::kINFO) :severity_(severity) {};
+    Logger(nvinfer1::ILogger::Severity severity = nvinfer1::ILogger::Severity::kINFO)
+        : severity_(severity){};
 
     void log(Severity severity, const char* msg) override
     {
@@ -94,21 +106,21 @@ class Logger :public nvinfer1::ILogger
         {
             switch (severity)
             {
-                case nvinfer1::ILogger::Severity::kINTERNAL_ERROR:
-                    fprintf(stderr, "Tengine Fatal: %s\n", msg);
-                    break;
-                case nvinfer1::ILogger::Severity::kERROR:
-                    fprintf(stderr, "Tengine Error: %s\n", msg);
-                    break;
-                case nvinfer1::ILogger::Severity::kWARNING:
-                    fprintf(stderr, "Tengine Warning: %s\n", msg);
-                    break;
-                case nvinfer1::ILogger::Severity::kINFO:
-                    fprintf(stderr, "Tengine Info: %s\n", msg);
-                    break;
-                default:
-                    fprintf(stderr, "Tengine Normal: %s\n", msg);
-                    break;
+            case nvinfer1::ILogger::Severity::kINTERNAL_ERROR:
+                fprintf(stderr, "Tengine Fatal: %s\n", msg);
+                break;
+            case nvinfer1::ILogger::Severity::kERROR:
+                fprintf(stderr, "Tengine Error: %s\n", msg);
+                break;
+            case nvinfer1::ILogger::Severity::kWARNING:
+                fprintf(stderr, "Tengine Warning: %s\n", msg);
+                break;
+            case nvinfer1::ILogger::Severity::kINFO:
+                fprintf(stderr, "Tengine Info: %s\n", msg);
+                break;
+            default:
+                fprintf(stderr, "Tengine Normal: %s\n", msg);
+                break;
             }
         }
         else
@@ -128,10 +140,9 @@ class Logger :public nvinfer1::ILogger
     }
 };
 
-
 struct InferDeleter
 {
-    template <typename T>
+    template<typename T>
     void operator()(T* obj) const
     {
         if (obj)
@@ -141,7 +152,6 @@ struct InferDeleter
     }
 };
 
-
 inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true)
 {
     if (useDLACore >= 0)
@@ -166,7 +176,6 @@ inline void enableDLA(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* con
     }
 }
 
-
 // Ensures that every tensor used by a network has a scale.
 //
 // All tensors in a network must have a range specified if a calibrator is not used.
@@ -187,7 +196,7 @@ void setAllTensorScales(nvinfer1::INetworkDefinition* network, float inScales =
         auto layer = network->getLayer(i);
         for (int j = 0; j < layer->getNbInputs(); j++)
         {
-            nvinfer1::ITensor* input{ layer->getInput(j) };
+            nvinfer1::ITensor* input{layer->getInput(j)};
             // Optional inputs are nullptr here and are from RNN layers.
             if (input != nullptr && !input->dynamicRangeIsSet())
             {
@@ -204,7 +213,7 @@ void setAllTensorScales(nvinfer1::INetworkDefinition* network, float inScales =
         auto layer = network->getLayer(i);
         for (int j = 0; j < layer->getNbOutputs(); j++)
         {
-            nvinfer1::ITensor* output{ layer->getOutput(j) };
+            nvinfer1::ITensor* output{layer->getOutput(j)};
             // Optional outputs are nullptr here and are from RNN layers.
             if (output != nullptr && !output->dynamicRangeIsSet())
             {
@@ -222,7 +231,6 @@ void setAllTensorScales(nvinfer1::INetworkDefinition* network, float inScales =
     }
 }
 
-
 struct CaffeBufferShutter
 {
     ~CaffeBufferShutter()
@@ -231,7 +239,6 @@ struct CaffeBufferShutter
     }
 };
 
-
 struct UffBufferShutter
 {
     ~UffBufferShutter()
@@ -240,9 +247,7 @@ struct UffBufferShutter
     }
 };
 
-
-template <typename T>
+template<typename T>
 using TensorRTSmartPoint = std::unique_ptr<T, InferDeleter>;
 
-
 using TensorRTShapeRange = std::array<nvinfer1::Dims, nvinfer1::EnumMax<nvinfer1::OptProfileSelector>()>;
diff --git a/source/device/tensorrt/trt_limit.hpp b/source/device/tensorrt/trt_limit.hpp
index 9380a28b3..aa39ef1af 100644
--- a/source/device/tensorrt/trt_limit.hpp
+++ b/source/device/tensorrt/trt_limit.hpp
@@ -42,57 +42,56 @@ EXPORT_FINISH
 
 #include <NvInfer.h>
 
-
 #if NV_TENSORRT_MAJOR < 5
 #error "Tengine: The minimum supported version of TensorRT is 5.\n"
 #endif
 
 const int trt_supported_ops[] = {
-        OP_ABSVAL,
-        OP_ADD_N,
+    OP_ABSVAL,
+    OP_ADD_N,
 #if NV_TENSORRT_MAJOR >= 6
 //        OP_ARGMAX,
 //        OP_ARGMIN,
 #endif
-        OP_BATCHNORM,
-        //OP_BATCHTOSPACEND,            // Not supported, last checked version 7.1.3
+    OP_BATCHNORM,
+//OP_BATCHTOSPACEND,            // Not supported, last checked version 7.1.3
 //        OP_BIAS,
 #if NV_TENSORRT_MAJOR >= 6
-//        OP_BROADMUL,
-//        OP_CAST,
-//        OP_CEIL,
-        OP_CLIP,
+    //        OP_BROADMUL,
+    //        OP_CAST,
+    //        OP_CEIL,
+    OP_CLIP,
 #endif
 #if NV_TENSORRT_MAJOR >= 7
 //        OP_COMPARISON,
 #endif
-        OP_CONCAT,
-        OP_CONST,
-        OP_CONV,
-        OP_CROP,
-        OP_DECONV,
-//        OP_DEPTHTOSPACE,
-        //OP_DETECTION_OUTPUT,          // Not supported, last checked version 7.1.3
-        //OP_DETECTION_POSTPROCESS,     // Not supported, last checked version 7.1.3
-        OP_DROPOUT,
-        OP_ELTWISE,
+    OP_CONCAT,
+    OP_CONST,
+    OP_CONV,
+    OP_CROP,
+    OP_DECONV,
+    //        OP_DEPTHTOSPACE,
+    //OP_DETECTION_OUTPUT,          // Not supported, last checked version 7.1.3
+    //OP_DETECTION_POSTPROCESS,     // Not supported, last checked version 7.1.3
+    OP_DROPOUT,
+    OP_ELTWISE,
 //        OP_ELU,
-        //OP_EMBEDDING,                 // Not supported, last checked version 7.1.3
+//OP_EMBEDDING,                 // Not supported, last checked version 7.1.3
 #if NV_TENSORRT_MAJOR >= 6
 //        OP_EXPANDDIMS,
 #endif
-        OP_FC,
-        OP_FLATTEN,
-//        OP_GATHER,
-        OP_GEMM,
+    OP_FC,
+    OP_FLATTEN,
+    //        OP_GATHER,
+    OP_GEMM,
 #if NV_TENSORRT_MAJOR >= 7
 //        OP_GRU,
 #endif
-//        OP_HARDSIGMOID,
-//        OP_HARDSWISH,                   // Not supported, last checked version 7.1.3
-        OP_INPUT,
-        OP_INSTANCENORM,
-        OP_INTERP,                      // should be as UpSample
+    //        OP_HARDSIGMOID,
+    //        OP_HARDSWISH,                   // Not supported, last checked version 7.1.3
+    OP_INPUT,
+    OP_INSTANCENORM,
+    OP_INTERP, // should be as UpSample
 //        OP_LOGICAL,
 #if NV_TENSORRT_MAJOR >= 7
 //        OP_LOGISTIC,
@@ -101,73 +100,73 @@ const int trt_supported_ops[] = {
 #if NV_TENSORRT_MAJOR >= 7
 //        OP_LSTM,
 #endif
-//        OP_MATMUL,
-//        OP_MAXIMUM,
-//        OP_MEAN,
-//        OP_MINIMUM,
-        //OP_MVN,                       // Not supported, last checked version 7.1.3
-//        OP_NOOP,
-        //OP_NORMALIZE,                 // Not supported, last checked version 7.1.3
-        OP_PAD,
-        OP_PERMUTE,
-        OP_POOL,
-//        OP_PRELU,
-        //OP_PRIORBOX,                  // Not supported, last checked version 7.1.3
-        //OP_PSROIPOOLING,              // Not supported, last checked version 7.1.3
-//        OP_REDUCEL2,
-        OP_REDUCTION,
-        //OP_REGION,                    // Not supported, last checked version 7.1.3
-        OP_RELU,
-        OP_RELU6,
-        //OP_REORG,                     // Not supported, last checked version 7.1.3
-        OP_RESHAPE,
+    //        OP_MATMUL,
+    //        OP_MAXIMUM,
+    //        OP_MEAN,
+    //        OP_MINIMUM,
+    //OP_MVN,                       // Not supported, last checked version 7.1.3
+    //        OP_NOOP,
+    //OP_NORMALIZE,                 // Not supported, last checked version 7.1.3
+    OP_PAD,
+    OP_PERMUTE,
+    OP_POOL,
+    //        OP_PRELU,
+    //OP_PRIORBOX,                  // Not supported, last checked version 7.1.3
+    //OP_PSROIPOOLING,              // Not supported, last checked version 7.1.3
+    //        OP_REDUCEL2,
+    OP_REDUCTION,
+    //OP_REGION,                    // Not supported, last checked version 7.1.3
+    OP_RELU,
+    OP_RELU6,
+    //OP_REORG,                     // Not supported, last checked version 7.1.3
+    OP_RESHAPE,
 #if NV_TENSORRT_MAJOR >= 6
-        OP_RESIZE,
+    OP_RESIZE,
 #endif
-        //OP_REVERSE,                   // Not supported, last checked version 7.1.3
+//OP_REVERSE,                   // Not supported, last checked version 7.1.3
 #if NV_TENSORRT_MAJOR >= 7
 //        OP_RNN,
 #endif
-        //OP_ROIALIGN,                  // Not supported, last checked version 7.1.3
-        //OP_ROIPOOLING,                // Not supported, last checked version 7.1.3
-        //OP_ROUND,
-        //OP_RPN,
+//OP_ROIALIGN,                  // Not supported, last checked version 7.1.3
+//OP_ROIPOOLING,                // Not supported, last checked version 7.1.3
+//OP_ROUND,
+//OP_RPN,
 //        OP_SCALE,
 //        OP_SELU,
-        //OP_SHUFFLECHANNEL,            // Not supported, last checked version 7.1.3
+//OP_SHUFFLECHANNEL,            // Not supported, last checked version 7.1.3
 //        OP_SIGMOID,
 #if NV_TENSORRT_MAJOR >= 6
-        OP_SLICE,
+    OP_SLICE,
 #endif
-        OP_SOFTMAX,
-        //OP_SPACETOBATCHND,            // Not supported, last checked version 7.1.3
-//        OP_SPACETODEPTH,
-        //OP_SPARSETODENSE,             // Not supported, last checked version 7.1.3
-        OP_SPLIT,
-        //OP_SQUAREDDIFFERENCE,         // Not supported, last checked version 7.1.3
-        OP_SQUEEZE,
-        //OP_STRIDED_SLICE,             // Not supported, last checked version 7.1.3
-        //OP_SWAP_AXIS,
-//        OP_TANH,
-        //OP_THRESHOLD,                 // Not supported, last checked version 7.1.3
-        //OP_THRESHOLD,                 // Not supported, last checked version 7.1.3
-//        OP_TOPKV2,
-        OP_TRANSPOSE,
-//        OP_UNARY,
-//        OP_UNSQUEEZE,
-        OP_UPSAMPLE,
-        //OP_ZEROSLIKE,                 // Not supported, last checked version 7.1.3
-        OP_MISH,
+    OP_SOFTMAX,
+    //OP_SPACETOBATCHND,            // Not supported, last checked version 7.1.3
+    //        OP_SPACETODEPTH,
+    //OP_SPARSETODENSE,             // Not supported, last checked version 7.1.3
+    OP_SPLIT,
+    //OP_SQUAREDDIFFERENCE,         // Not supported, last checked version 7.1.3
+    OP_SQUEEZE,
+    //OP_STRIDED_SLICE,             // Not supported, last checked version 7.1.3
+    //OP_SWAP_AXIS,
+    //        OP_TANH,
+    //OP_THRESHOLD,                 // Not supported, last checked version 7.1.3
+    //OP_THRESHOLD,                 // Not supported, last checked version 7.1.3
+    //        OP_TOPKV2,
+    OP_TRANSPOSE,
+    //        OP_UNARY,
+    //        OP_UNSQUEEZE,
+    OP_UPSAMPLE,
+    //OP_ZEROSLIKE,                 // Not supported, last checked version 7.1.3
+    OP_MISH,
 //        OP_LOGSOFTMAX,
 #if NV_TENSORRT_MAJOR >= 6
-        OP_RELU1,
+    OP_RELU1,
 #endif
-        //OP_L2NORMALIZATION,         // Not supported, last checked version 7.1.3
-        //OP_L2POOL,                  // Not supported, last checked version 7.1.3
+//OP_L2NORMALIZATION,         // Not supported, last checked version 7.1.3
+//OP_L2POOL,                  // Not supported, last checked version 7.1.3
 #if NV_TENSORRT_MAJOR >= 7
 //        OP_TILE,
 #endif
-        OP_SHAPE,
+    OP_SHAPE,
 //        OP_SCATTER,
 #if NV_TENSORRT_MAJOR >= 7
 //        OP_WHERE,
diff --git a/source/device/tim-vx/timvx_device.hpp b/source/device/tim-vx/timvx_device.hpp
index 67b4c742b..d5aba6230 100644
--- a/source/device/tim-vx/timvx_device.hpp
+++ b/source/device/tim-vx/timvx_device.hpp
@@ -26,8 +26,7 @@
 
 #include "timvx_define.h"
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 
diff --git a/source/device/tim-vx/timvx_dump.c b/source/device/tim-vx/timvx_dump.c
index 7035f21d7..640e024e7 100644
--- a/source/device/tim-vx/timvx_dump.c
+++ b/source/device/tim-vx/timvx_dump.c
@@ -1,562 +1,559 @@
-
-#include "timvx_dump.h"
-
-#include "device/device.h"
-#include "graph/tensor.h"
-#include "graph/node.h"
-#include "graph/graph.h"
-#include "graph/subgraph.h"
-#include "operator/op.h"
-#include "utility/log.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef _MSC_VER
-#include <windows.h>
-#else
-#include <sys/stat.h>
-#include <sys/time.h>
-#endif
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef _MSC_VER
-#include <windows.h>
-#else
-#include <sys/stat.h>
-#include <sys/time.h>
-#endif
-
-
-int print_tensor_data_value_timvx(FILE* file, const struct tensor* tensor, int offset)
-{
-    switch (tensor->data_type)
-    {
-        case TENGINE_DT_FP32:
-        {
-            float* base_ptr = (float*)tensor->data;
-            float val = base_ptr[offset];
-            if (val < 0)
-                fprintf(file, "%.4f ", val);
-            else
-                fprintf(file, " %.4f ", val);
-            break;
-        }
-//        case TENGINE_DT_FP16:
-//        {
-//            fp16_t* base_ptr = (fp16_t*)tensor->data;
-//            fp16_t val = base_ptr[offset];
-//
-//            float val_fp32 = fp16_to_fp32(val);
-//
-//            if (val_fp32 < 0)
-//                fprintf(file, "%.4f ", val_fp32);
-//            else
-//                fprintf(file, " %.4f ", val_fp32);
-//            break;
-//        }
-        case TENGINE_DT_UINT8:
-        {
-            uint8_t* base_ptr = (uint8_t*)tensor->data;
-            uint8_t val = base_ptr[offset];
-
-            float scale = tensor->scale;
-            int32_t zero_point = tensor->zero_point;
-
-            float val_fp32 = (float)((int)val - (int)zero_point) * scale;
-            if (val_fp32 < 0)
-                fprintf(file, "%.4f ", val_fp32);
-            else
-                fprintf(file, " %.4f ", val_fp32);
-            break;
-        }
-        case TENGINE_DT_INT8:
-        {
-            int8_t* base_ptr = (int8_t*)tensor->data;
-            int8_t val = base_ptr[offset];
-
-            float scale = tensor->scale;
-
-            float val_fp32 = (float)val * scale;
-            if (val_fp32 < 0)
-                fprintf(file, "%.4f ", val_fp32);
-            else
-                fprintf(file, " %.4f ", val_fp32);
-        }
-        case TENGINE_DT_INT32:
-        {
-            int32_t* base_ptr = (int32_t*)tensor->data;
-            int8_t val = base_ptr[offset];
-
-            float scale = tensor->scale;
-            float val_fp32 = (float)val * scale;
-
-            if (val_fp32 < 0)
-                fprintf(file, "%.6f ", val_fp32);
-            else
-                fprintf(file, " %.6f ", val_fp32);
-        }
-    }
-
-    return 0;
-}
-
-const char* get_tensor_data_type_string_timvx(int data_type)
-{
-    switch (data_type)
-    {
-        case TENGINE_DT_FP32:
-            return "fp32";
-        case TENGINE_DT_FP16:
-            return "fp16";
-        case TENGINE_DT_INT8:
-            return "int8";
-        case TENGINE_DT_UINT8:
-            return "uint8";
-        case TENGINE_DT_INT32:
-            return "int32";
-        case TENGINE_DT_INT16:
-            return "int16";
-        default:
-            return "unknown";
-    }
-}
-
-void print_tensor_data_to_file_timvx(FILE* file, const struct tensor* tensor)
-{
-    switch (tensor->dim_num)
-    {
-        case 5:
-        {
-            int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                channel = tensor->dims[2];
-                height = tensor->dims[3];
-                width = tensor->dims[4];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                height = tensor->dims[2];
-                width = tensor->dims[3];
-                channel = tensor->dims[4];
-            }
-
-            if (TENGINE_DT_FP32 == tensor->data_type)
-            {
-                fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width);
-            }
-            else
-            {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name);
-                }
-            }
-
-            for (int d5 = 0; d5 < dim5; d5++)
-            {
-                fprintf(file, "Dim5 %d:\n", d5);
-
-                for (int n = 0; n < batch; n++)
-                {
-                    fprintf(file, "\tBatch %d:\n", n);
-
-                    for (int ch = 0; ch < channel; ch++)
-                    {
-                        fprintf(file, "\t\tChannel %d:\n", ch);
-
-                        for (int h = 0; h < height; h++)
-                        {
-                            fprintf(file, "\t\t\t");
-
-                            for (int w = 0; w < width; w++)
-                            {
-                                int offset = 0;
-
-                                if (TENGINE_LAYOUT_NCHW == tensor->layout)
-                                {
-                                    offset += d5 * batch * channel * height * width;
-                                    offset += n * channel * height * width;
-                                    offset += ch * height * width;
-                                    offset += h * width;
-                                    offset += w;
-                                }
-                                if (TENGINE_LAYOUT_NHWC == tensor->layout)
-                                {
-                                    offset += d5 * batch * channel * height * width;
-                                    offset += n * channel * height * width;
-                                    offset += ch;
-                                    offset += h * width * channel;
-                                    offset += w * channel;
-                                }
-
-                                print_tensor_data_value_timvx(file, tensor, offset);
-                            }
-                            fprintf(file, "\n");
-                        }
-                        fprintf(file, "\n");
-                    }
-                    fprintf(file, "\n");
-                }
-                fprintf(file, "\n");
-            }
-
-            break;
-        }
-        case 4:
-        {
-            int batch = tensor->dims[0], channel = 0, height = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                channel = tensor->dims[1];
-                height = tensor->dims[2];
-                width = tensor->dims[3];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                height = tensor->dims[1];
-                width = tensor->dims[2];
-                channel = tensor->dims[3];
-            }
-
-            if (TENGINE_DT_FP32 == tensor->data_type)
-            {
-                fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width);
-            }
-            else
-            {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name);
-                }
-            }
-
-            for (int n = 0; n < batch; n++)
-            {
-                fprintf(file, "Batch %d:\n", n);
-
-                for (int ch = 0; ch < channel; ch++)
-                {
-                    fprintf(file, "\tChannel %d:\n", ch);
-
-                    for (int h = 0; h < height; h++)
-                    {
-                        fprintf(file, "\t\t");
-
-                        for (int w = 0; w < width; w++)
-                        {
-                            int offset = 0;
-
-                            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-                            {
-                                offset += n * channel * height * width;
-                                offset += ch * height * width;
-                                offset += h * width;
-                                offset += w;
-                            }
-                            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-                            {
-                                offset += n * channel * height * width;
-                                offset += ch;
-                                offset += h * width * channel;
-                                offset += w * channel;
-                            }
-
-                            print_tensor_data_value_timvx(file, tensor, offset);
-                        }
-                        fprintf(file, "\n");
-                    }
-                    fprintf(file, "\n");
-                }
-                fprintf(file, "\n");
-            }
-
-            break;
-        }
-        case 3:
-        {
-            int batch = 0, height = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                batch = tensor->dims[0];
-                height = tensor->dims[1];
-                width = tensor->dims[2];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                height = tensor->dims[0];
-                width = tensor->dims[1];
-                batch = tensor->dims[2];
-            }
-
-            if (TENGINE_DT_FP32 == tensor->data_type)
-            {
-                fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width);
-            }
-            else
-            {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name);
-                }
-            }
-
-            for (int n = 0; n < batch; n++)
-            {
-                for (int h = 0; h < height; h++)
-                {
-                    fprintf(file, "Channel %d:\n", h);
-                    fprintf(file, "\t");
-
-                    for (int w = 0; w < width; w++)
-                    {
-                        int offset = 0;
-
-                        if (TENGINE_LAYOUT_NCHW == tensor->layout)
-                        {
-                            offset += n * height * width;
-                            offset += h * width;
-                            offset += w;
-                        }
-                        if (TENGINE_LAYOUT_NHWC == tensor->layout)
-                        {
-                            offset += h;
-                            offset += n * width * height;
-                            offset += w * height;
-                        }
-
-                        print_tensor_data_value_timvx(file, tensor, offset);
-                    }
-                    fprintf(file, "\n");
-                }
-                fprintf(file, "\n");
-            }
-
-            break;
-        }
-        case 2:
-        {
-            int batch = 0, width = 0;
-
-            if (TENGINE_LAYOUT_NCHW == tensor->layout)
-            {
-                batch = tensor->dims[0];
-                width = tensor->dims[1];
-            }
-            if (TENGINE_LAYOUT_NHWC == tensor->layout)
-            {
-                batch = tensor->dims[0];
-                width = tensor->dims[1];
-            }
-
-            if (TENGINE_DT_FP32 == tensor->data_type)
-            {
-                fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width);
-            }
-            else
-            {
-                if (TENGINE_DT_FP16 == tensor->data_type)
-                {
-                    fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width);
-                }
-                else
-                {
-                    const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
-                    fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name);
-                }
-            }
-
-            for (int n = 0; n < batch; n++)
-            {
-                for (int w = 0; w < width; w++)
-                {
-                    int offset = 0;
-
-                    offset += n * width;
-                    offset += w;
-
-                    print_tensor_data_value_timvx(file, tensor, offset);
-                }
-                fprintf(file, "\n");
-            }
-
-            break;
-        }
-        case 1:
-        {
-            int width = tensor->dims[0];
-
-            fprintf(file, "Shape is {%d}, data type is fp32\n", width);
-
-
-            for (int w = 0; w < width; w++)
-            {
-                print_tensor_data_value_timvx(file, tensor, w);
-            }
-
-            break;
-        }
-        default:
-            printf("Input dimension %d not to be supported.\n", tensor->dim_num);
-    }
-}
-
-char* replace_string_character_timvx(char* src_str, char* dst_str, char* target_char, char* replaced_char)
-{
-    char* p;
-    char* _out = dst_str;
-    char* _str = src_str;
-    char* _src = target_char;
-    char* _dst = replaced_char;
-    size_t src_size = strlen(_src);
-    size_t dst_size = strlen(_dst);
-    size_t len = 0;
-
-    do
-    {
-        p = strstr(_str, _src);
-        if (p == 0)
-        {
-            strcpy(_out, _str);
-            return dst_str;
-        }
-        len = p - _str;
-        memcpy(_out, _str, len);
-        memcpy(_out + len, _dst, dst_size);
-        _str = p + src_size;
-        _out = _out + len + dst_size;
-    } while (p);
-
-    return dst_str;
-}
-
-void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor)
-{
-    // 1. deal with saving path
-    char save_dir[256] = { '0' };
-
-    const char *env_path = getenv(TENGINE_DUMP_DIR);
-
-    if (NULL != env_path && (256 - 2) > strlen(env_path))
-    {
-        strcpy(save_dir, env_path);
-
-        if ('/' == save_dir[strlen(env_path)] || '\\' == save_dir[strlen(env_path)])
-        {
-#ifdef _MSC_VER
-            save_dir[strlen(env_path)] = '\\';
-            save_dir[strlen(env_path) + 1] = 0;
-#else
-            save_dir[strlen(env_path)] = '/';
-            save_dir[strlen(env_path) + 1] = 0;
-#endif
-        }
-    }
-    else
-    {
-//        TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path));
-        sprintf(save_dir, "./output/");
-#ifdef _MSC_VER
-        CreateDirectoryA(save_dir, NULL);
-#else
-        int ret = mkdir(save_dir, S_IRWXU | S_IRGRP | S_IWGRP | S_IROTH);
-//        if (0 != ret)
-//        {
-//            TLOG_WARNING("Tengine: Create saving folder failed(%d), skip dump.\n", ret);
-//            return;
-//        }
-#endif
-    }
-
-    // 2. deal with layer name
-    char layer_short_name[64], layer_legal_name[64];
-
-    if (64 < strlen(layer_name))
-    {
-        memcpy(layer_short_name, layer_name, 64 - 1);
-        layer_short_name[64 - 1] = 0;
-    }
-    else
-    {
-        strcpy(layer_short_name, layer_name);
-    }
-
-    replace_string_character_timvx(layer_short_name, layer_legal_name, "/", "-");
-
-    // 3. join path
-    char output_file_path[512] = { '0' };
-
-    if (strlen(layer_legal_name) + strlen(save_dir) + strlen(comment) > 256 - 16)
-    {
-        TLOG_WARNING("Tengine: Name of saving file is too long(%d vs. %d), skip dump.\n", strlen(layer_legal_name) + strlen(save_dir) + strlen(comment), 256 - 16);
-        return;
-    }
-
-    sprintf(output_file_path, "%s%s_%s_blob_data.txt", save_dir, layer_legal_name, comment);
-
-    FILE* file = fopen(output_file_path, "w");
-    if (NULL == file)
-    {
-        fprintf(stderr, "Tengine: Open file(%s) failed, skip dump\n", output_file_path);
-        return;
-    }
-
-    print_tensor_data_to_file_timvx(file, tensor);
-
-    // close file
-    fclose(file);
-    file = NULL;
-}
-
-void dump_sub_graph_timvx(struct subgraph* sub_graph)
-{
-    TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num);
-    TLOG_INFO("\tSub nodes: [ ");
-
-    for (int j = 0; j < sub_graph->node_num - 1; j++)
-    {
-        int node_id = sub_graph->node_list[j];
-        TLOG_INFO("%d, ", node_id);
-    }
-    TLOG_INFO("%d ].\n", sub_graph->node_list[sub_graph->node_num - 1]);
-
-    TLOG_INFO("\tSub input tensors: [ ");
-    for (int j = 0; j < sub_graph->input_num - 1; j++)
-    {
-        int tensor_id = sub_graph->input_tensor_list[j];
-        TLOG_INFO("%d, ", tensor_id);
-    }
-    TLOG_INFO("%d ].\n", sub_graph->input_tensor_list[sub_graph->input_num - 1]);
-
-    TLOG_INFO("\tSub output tensors: [ ");
-    for (int j = 0; j < sub_graph->output_num - 1; j++)
-    {
-        int tensor_id = sub_graph->output_tensor_list[j];
-        TLOG_INFO("%d, ", tensor_id);
-    }
-    TLOG_INFO("%d ].\n", sub_graph->output_tensor_list[sub_graph->output_num - 1]);
+
+#include "timvx_dump.h"
+
+#include "device/device.h"
+#include "graph/tensor.h"
+#include "graph/node.h"
+#include "graph/graph.h"
+#include "graph/subgraph.h"
+#include "operator/op.h"
+#include "utility/log.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _MSC_VER
+#include <windows.h>
+#else
+#include <sys/stat.h>
+#include <sys/time.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _MSC_VER
+#include <windows.h>
+#else
+#include <sys/stat.h>
+#include <sys/time.h>
+#endif
+
+int print_tensor_data_value_timvx(FILE* file, const struct tensor* tensor, int offset)
+{
+    switch (tensor->data_type)
+    {
+    case TENGINE_DT_FP32:
+    {
+        float* base_ptr = (float*)tensor->data;
+        float val = base_ptr[offset];
+        if (val < 0)
+            fprintf(file, "%.4f ", val);
+        else
+            fprintf(file, " %.4f ", val);
+        break;
+    }
+        //        case TENGINE_DT_FP16:
+        //        {
+        //            fp16_t* base_ptr = (fp16_t*)tensor->data;
+        //            fp16_t val = base_ptr[offset];
+        //
+        //            float val_fp32 = fp16_to_fp32(val);
+        //
+        //            if (val_fp32 < 0)
+        //                fprintf(file, "%.4f ", val_fp32);
+        //            else
+        //                fprintf(file, " %.4f ", val_fp32);
+        //            break;
+        //        }
+    case TENGINE_DT_UINT8:
+    {
+        uint8_t* base_ptr = (uint8_t*)tensor->data;
+        uint8_t val = base_ptr[offset];
+
+        float scale = tensor->scale;
+        int32_t zero_point = tensor->zero_point;
+
+        float val_fp32 = (float)((int)val - (int)zero_point) * scale;
+        if (val_fp32 < 0)
+            fprintf(file, "%.4f ", val_fp32);
+        else
+            fprintf(file, " %.4f ", val_fp32);
+        break;
+    }
+    case TENGINE_DT_INT8:
+    {
+        int8_t* base_ptr = (int8_t*)tensor->data;
+        int8_t val = base_ptr[offset];
+
+        float scale = tensor->scale;
+
+        float val_fp32 = (float)val * scale;
+        if (val_fp32 < 0)
+            fprintf(file, "%.4f ", val_fp32);
+        else
+            fprintf(file, " %.4f ", val_fp32);
+    }
+    case TENGINE_DT_INT32:
+    {
+        int32_t* base_ptr = (int32_t*)tensor->data;
+        int8_t val = base_ptr[offset];
+
+        float scale = tensor->scale;
+        float val_fp32 = (float)val * scale;
+
+        if (val_fp32 < 0)
+            fprintf(file, "%.6f ", val_fp32);
+        else
+            fprintf(file, " %.6f ", val_fp32);
+    }
+    }
+
+    return 0;
+}
+
+const char* get_tensor_data_type_string_timvx(int data_type)
+{
+    switch (data_type)
+    {
+    case TENGINE_DT_FP32:
+        return "fp32";
+    case TENGINE_DT_FP16:
+        return "fp16";
+    case TENGINE_DT_INT8:
+        return "int8";
+    case TENGINE_DT_UINT8:
+        return "uint8";
+    case TENGINE_DT_INT32:
+        return "int32";
+    case TENGINE_DT_INT16:
+        return "int16";
+    default:
+        return "unknown";
+    }
+}
+
+void print_tensor_data_to_file_timvx(FILE* file, const struct tensor* tensor)
+{
+    switch (tensor->dim_num)
+    {
+    case 5:
+    {
+        int dim5 = tensor->dims[0], batch = tensor->dims[1], channel = 0, height = 0, width = 0;
+
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            channel = tensor->dims[2];
+            height = tensor->dims[3];
+            width = tensor->dims[4];
+        }
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
+        {
+            height = tensor->dims[2];
+            width = tensor->dims[3];
+            channel = tensor->dims[4];
+        }
+
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp32\n", dim5, batch, channel, height, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
+            {
+                fprintf(file, "Shape is {%d %d %d %d %d}, data type is fp16, cast to fp32\n", dim5, batch, channel, height, width);
+            }
+            else
+            {
+                const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
+                fprintf(file, "Shape is {%d %d %d %d %d}, data type is %s, inverse quantization to fp32\n", dim5, batch, channel, height, width, type_name);
+            }
+        }
+
+        for (int d5 = 0; d5 < dim5; d5++)
+        {
+            fprintf(file, "Dim5 %d:\n", d5);
+
+            for (int n = 0; n < batch; n++)
+            {
+                fprintf(file, "\tBatch %d:\n", n);
+
+                for (int ch = 0; ch < channel; ch++)
+                {
+                    fprintf(file, "\t\tChannel %d:\n", ch);
+
+                    for (int h = 0; h < height; h++)
+                    {
+                        fprintf(file, "\t\t\t");
+
+                        for (int w = 0; w < width; w++)
+                        {
+                            int offset = 0;
+
+                            if (TENGINE_LAYOUT_NCHW == tensor->layout)
+                            {
+                                offset += d5 * batch * channel * height * width;
+                                offset += n * channel * height * width;
+                                offset += ch * height * width;
+                                offset += h * width;
+                                offset += w;
+                            }
+                            if (TENGINE_LAYOUT_NHWC == tensor->layout)
+                            {
+                                offset += d5 * batch * channel * height * width;
+                                offset += n * channel * height * width;
+                                offset += ch;
+                                offset += h * width * channel;
+                                offset += w * channel;
+                            }
+
+                            print_tensor_data_value_timvx(file, tensor, offset);
+                        }
+                        fprintf(file, "\n");
+                    }
+                    fprintf(file, "\n");
+                }
+                fprintf(file, "\n");
+            }
+            fprintf(file, "\n");
+        }
+
+        break;
+    }
+    case 4:
+    {
+        int batch = tensor->dims[0], channel = 0, height = 0, width = 0;
+
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            channel = tensor->dims[1];
+            height = tensor->dims[2];
+            width = tensor->dims[3];
+        }
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
+        {
+            height = tensor->dims[1];
+            width = tensor->dims[2];
+            channel = tensor->dims[3];
+        }
+
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d %d %d}, data type is fp32\n", batch, channel, height, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
+            {
+                fprintf(file, "Shape is {%d %d %d %d}, data type is fp16, cast to fp32\n", batch, channel, height, width);
+            }
+            else
+            {
+                const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
+                fprintf(file, "Shape is {%d %d %d %d}, data type is %s, inverse quantization to fp32\n", batch, channel, height, width, type_name);
+            }
+        }
+
+        for (int n = 0; n < batch; n++)
+        {
+            fprintf(file, "Batch %d:\n", n);
+
+            for (int ch = 0; ch < channel; ch++)
+            {
+                fprintf(file, "\tChannel %d:\n", ch);
+
+                for (int h = 0; h < height; h++)
+                {
+                    fprintf(file, "\t\t");
+
+                    for (int w = 0; w < width; w++)
+                    {
+                        int offset = 0;
+
+                        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+                        {
+                            offset += n * channel * height * width;
+                            offset += ch * height * width;
+                            offset += h * width;
+                            offset += w;
+                        }
+                        if (TENGINE_LAYOUT_NHWC == tensor->layout)
+                        {
+                            offset += n * channel * height * width;
+                            offset += ch;
+                            offset += h * width * channel;
+                            offset += w * channel;
+                        }
+
+                        print_tensor_data_value_timvx(file, tensor, offset);
+                    }
+                    fprintf(file, "\n");
+                }
+                fprintf(file, "\n");
+            }
+            fprintf(file, "\n");
+        }
+
+        break;
+    }
+    case 3:
+    {
+        int batch = 0, height = 0, width = 0;
+
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            batch = tensor->dims[0];
+            height = tensor->dims[1];
+            width = tensor->dims[2];
+        }
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
+        {
+            height = tensor->dims[0];
+            width = tensor->dims[1];
+            batch = tensor->dims[2];
+        }
+
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d %d}, data type is fp32\n", batch, height, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
+            {
+                fprintf(file, "Shape is {%d %d %d}, data type is fp16, cast to fp32\n", batch, height, width);
+            }
+            else
+            {
+                const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
+                fprintf(file, "Shape is {%d %d %d}, data type is %s, inverse quantization to fp32\n", batch, height, width, type_name);
+            }
+        }
+
+        for (int n = 0; n < batch; n++)
+        {
+            for (int h = 0; h < height; h++)
+            {
+                fprintf(file, "Channel %d:\n", h);
+                fprintf(file, "\t");
+
+                for (int w = 0; w < width; w++)
+                {
+                    int offset = 0;
+
+                    if (TENGINE_LAYOUT_NCHW == tensor->layout)
+                    {
+                        offset += n * height * width;
+                        offset += h * width;
+                        offset += w;
+                    }
+                    if (TENGINE_LAYOUT_NHWC == tensor->layout)
+                    {
+                        offset += h;
+                        offset += n * width * height;
+                        offset += w * height;
+                    }
+
+                    print_tensor_data_value_timvx(file, tensor, offset);
+                }
+                fprintf(file, "\n");
+            }
+            fprintf(file, "\n");
+        }
+
+        break;
+    }
+    case 2:
+    {
+        int batch = 0, width = 0;
+
+        if (TENGINE_LAYOUT_NCHW == tensor->layout)
+        {
+            batch = tensor->dims[0];
+            width = tensor->dims[1];
+        }
+        if (TENGINE_LAYOUT_NHWC == tensor->layout)
+        {
+            batch = tensor->dims[0];
+            width = tensor->dims[1];
+        }
+
+        if (TENGINE_DT_FP32 == tensor->data_type)
+        {
+            fprintf(file, "Shape is {%d %d}, data type is fp32\n", batch, width);
+        }
+        else
+        {
+            if (TENGINE_DT_FP16 == tensor->data_type)
+            {
+                fprintf(file, "Shape is {%d %d}, data type is fp16, cast to fp32\n", batch, width);
+            }
+            else
+            {
+                const char* type_name = get_tensor_data_type_string_timvx(tensor->data_type);
+                fprintf(file, "Shape is {%d %d}, data type is %s, inverse quantization to fp32\n", batch, width, type_name);
+            }
+        }
+
+        for (int n = 0; n < batch; n++)
+        {
+            for (int w = 0; w < width; w++)
+            {
+                int offset = 0;
+
+                offset += n * width;
+                offset += w;
+
+                print_tensor_data_value_timvx(file, tensor, offset);
+            }
+            fprintf(file, "\n");
+        }
+
+        break;
+    }
+    case 1:
+    {
+        int width = tensor->dims[0];
+
+        fprintf(file, "Shape is {%d}, data type is fp32\n", width);
+
+        for (int w = 0; w < width; w++)
+        {
+            print_tensor_data_value_timvx(file, tensor, w);
+        }
+
+        break;
+    }
+    default:
+        printf("Input dimension %d not to be supported.\n", tensor->dim_num);
+    }
+}
+
+char* replace_string_character_timvx(char* src_str, char* dst_str, char* target_char, char* replaced_char)
+{
+    char* p;
+    char* _out = dst_str;
+    char* _str = src_str;
+    char* _src = target_char;
+    char* _dst = replaced_char;
+    size_t src_size = strlen(_src);
+    size_t dst_size = strlen(_dst);
+    size_t len = 0;
+
+    do
+    {
+        p = strstr(_str, _src);
+        if (p == 0)
+        {
+            strcpy(_out, _str);
+            return dst_str;
+        }
+        len = p - _str;
+        memcpy(_out, _str, len);
+        memcpy(_out + len, _dst, dst_size);
+        _str = p + src_size;
+        _out = _out + len + dst_size;
+    } while (p);
+
+    return dst_str;
+}
+
+void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor)
+{
+    // 1. deal with saving path
+    char save_dir[256] = {'0'};
+
+    const char* env_path = getenv(TENGINE_DUMP_DIR);
+
+    if (NULL != env_path && (256 - 2) > strlen(env_path))
+    {
+        strcpy(save_dir, env_path);
+
+        if ('/' == save_dir[strlen(env_path)] || '\\' == save_dir[strlen(env_path)])
+        {
+#ifdef _MSC_VER
+            save_dir[strlen(env_path)] = '\\';
+            save_dir[strlen(env_path) + 1] = 0;
+#else
+            save_dir[strlen(env_path)] = '/';
+            save_dir[strlen(env_path) + 1] = 0;
+#endif
+        }
+    }
+    else
+    {
+        //        TLOG_WARNING("Tengine: Env var \"TENGINE_DUMP_DIR\" is too long(%d vs. 254). Using default path.\n", strlen(env_path));
+        sprintf(save_dir, "./output/");
+#ifdef _MSC_VER
+        CreateDirectoryA(save_dir, NULL);
+#else
+        int ret = mkdir(save_dir, S_IRWXU | S_IRGRP | S_IWGRP | S_IROTH);
+//        if (0 != ret)
+//        {
+//            TLOG_WARNING("Tengine: Create saving folder failed(%d), skip dump.\n", ret);
+//            return;
+//        }
+#endif
+    }
+
+    // 2. deal with layer name
+    char layer_short_name[64], layer_legal_name[64];
+
+    if (64 < strlen(layer_name))
+    {
+        memcpy(layer_short_name, layer_name, 64 - 1);
+        layer_short_name[64 - 1] = 0;
+    }
+    else
+    {
+        strcpy(layer_short_name, layer_name);
+    }
+
+    replace_string_character_timvx(layer_short_name, layer_legal_name, "/", "-");
+
+    // 3. join path
+    char output_file_path[512] = {'0'};
+
+    if (strlen(layer_legal_name) + strlen(save_dir) + strlen(comment) > 256 - 16)
+    {
+        TLOG_WARNING("Tengine: Name of saving file is too long(%d vs. %d), skip dump.\n", strlen(layer_legal_name) + strlen(save_dir) + strlen(comment), 256 - 16);
+        return;
+    }
+
+    sprintf(output_file_path, "%s%s_%s_blob_data.txt", save_dir, layer_legal_name, comment);
+
+    FILE* file = fopen(output_file_path, "w");
+    if (NULL == file)
+    {
+        fprintf(stderr, "Tengine: Open file(%s) failed, skip dump\n", output_file_path);
+        return;
+    }
+
+    print_tensor_data_to_file_timvx(file, tensor);
+
+    // close file
+    fclose(file);
+    file = NULL;
+}
+
+void dump_sub_graph_timvx(struct subgraph* sub_graph)
+{
+    TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num);
+    TLOG_INFO("\tSub nodes: [ ");
+
+    for (int j = 0; j < sub_graph->node_num - 1; j++)
+    {
+        int node_id = sub_graph->node_list[j];
+        TLOG_INFO("%d, ", node_id);
+    }
+    TLOG_INFO("%d ].\n", sub_graph->node_list[sub_graph->node_num - 1]);
+
+    TLOG_INFO("\tSub input tensors: [ ");
+    for (int j = 0; j < sub_graph->input_num - 1; j++)
+    {
+        int tensor_id = sub_graph->input_tensor_list[j];
+        TLOG_INFO("%d, ", tensor_id);
+    }
+    TLOG_INFO("%d ].\n", sub_graph->input_tensor_list[sub_graph->input_num - 1]);
+
+    TLOG_INFO("\tSub output tensors: [ ");
+    for (int j = 0; j < sub_graph->output_num - 1; j++)
+    {
+        int tensor_id = sub_graph->output_tensor_list[j];
+        TLOG_INFO("%d, ", tensor_id);
+    }
+    TLOG_INFO("%d ].\n", sub_graph->output_tensor_list[sub_graph->output_num - 1]);
 }
\ No newline at end of file
diff --git a/source/device/tim-vx/timvx_dump.h b/source/device/tim-vx/timvx_dump.h
index 8c9a607d3..7a9f1778b 100644
--- a/source/device/tim-vx/timvx_dump.h
+++ b/source/device/tim-vx/timvx_dump.h
@@ -1,35 +1,35 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2021, OPEN AI LAB
- * Author: lswang@openailab.com
- */
-
-#pragma once
-
-struct tensor;
-struct subgraph;
-
-#define TENGINE_DUMP_DIR            "TG_DEBUG_DUMP_DIR"
-#define TENGINE_DUMP_LAYER          "TG_DEBUG_DATA"
-
-void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor);
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: lswang@openailab.com
+ */
+
+#pragma once
+
+struct tensor;
+struct subgraph;
+
+#define TENGINE_DUMP_DIR   "TG_DEBUG_DUMP_DIR"
+#define TENGINE_DUMP_LAYER "TG_DEBUG_DATA"
+
+void extract_feature_from_tensor_timvx(const char* comment, const char* layer_name, const struct tensor* tensor);
+
 void dump_sub_graph_timvx(struct subgraph* sub_graph);
\ No newline at end of file
diff --git a/source/device/tim-vx/timvx_executor.hpp b/source/device/tim-vx/timvx_executor.hpp
index bdb1f8d4c..faedb1529 100644
--- a/source/device/tim-vx/timvx_executor.hpp
+++ b/source/device/tim-vx/timvx_executor.hpp
@@ -24,8 +24,7 @@
 
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "device/device.h"
 #include "graph/tensor.h"
 #include "graph/node.h"
@@ -45,7 +44,6 @@ extern "C"
 #include <vector>
 #include <cmath>
 
-
 #include "convolution_param.h"
 
 #include "tim/vx/tensor.h"
@@ -74,20 +72,18 @@ extern "C"
 #include "tim/vx/ops/split.h"
 #include "tim/vx/ops/transpose.h"
 
-#define SPEC_TYPE_CONV           1
-#define SPEC_TYPE_CONV_BIAS      2
-#define SPEC_TYPE_DWCONV         3
-#define SPEC_TYPE_INTERP         4
-#define SPEC_TYPE_OUTPUT         5
-#define SPEC_TYPE_PRELU          6
-#define SPEC_TYPE_SLICE          7
-#define SPEC_TYPE_RESHAPE        8
-#define SPEC_TYPE_INPUT          9
-
-
-typedef std::map<uint32_t, std::shared_ptr<tim::vx::Tensor>> dict_irt2vxt;
-typedef std::map<uint32_t, std::shared_ptr<tim::vx::Operation>> dict_irt2vxo;
+#define SPEC_TYPE_CONV      1
+#define SPEC_TYPE_CONV_BIAS 2
+#define SPEC_TYPE_DWCONV    3
+#define SPEC_TYPE_INTERP    4
+#define SPEC_TYPE_OUTPUT    5
+#define SPEC_TYPE_PRELU     6
+#define SPEC_TYPE_SLICE     7
+#define SPEC_TYPE_RESHAPE   8
+#define SPEC_TYPE_INPUT     9
 
+typedef std::map<uint32_t, std::shared_ptr<tim::vx::Tensor> > dict_irt2vxt;
+typedef std::map<uint32_t, std::shared_ptr<tim::vx::Operation> > dict_irt2vxo;
 
 class VXEngine
 {
@@ -136,15 +132,13 @@ class VXEngine
     bool AddTransposeNode(struct node* ir_node);
     bool AddUpsampleNode(struct node* ir_node);
 
-
 public:
     std::shared_ptr<tim::vx::Context> context;
     std::shared_ptr<tim::vx::Graph> graph;
     std::shared_ptr<tim::vx::Operation> ops;
     std::vector<char> nbg_buffer;
 
-
 private:
-    dict_irt2vxt     vx_tensor_map;
-    dict_irt2vxo     vx_node_map;
+    dict_irt2vxt vx_tensor_map;
+    dict_irt2vxo vx_node_map;
 };
diff --git a/source/device/tim-vx/timvx_graph.hpp b/source/device/tim-vx/timvx_graph.hpp
index defaf4cd8..156fc7a1e 100644
--- a/source/device/tim-vx/timvx_graph.hpp
+++ b/source/device/tim-vx/timvx_graph.hpp
@@ -24,12 +24,10 @@
 
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "device/device.h"
 #include "graph/subgraph.h"
 
-
 int timvx_dev_init(struct device* dev);
 int timvx_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options);
 int timvx_dev_run(struct device* dev, struct subgraph* subgraph);
diff --git a/source/device/tim-vx/timvx_limit.hpp b/source/device/tim-vx/timvx_limit.hpp
index acb2e6bb4..ba521dd12 100644
--- a/source/device/tim-vx/timvx_limit.hpp
+++ b/source/device/tim-vx/timvx_limit.hpp
@@ -24,113 +24,111 @@
 
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "operator/op.h"
 }
 
-
 const int timvx_supported_ops[] = {
 
-//    OP_GENERIC,
-//    OP_ABSVAL,
-//    OP_ADD_N,
-//    OP_ARGMAX,
-//    OP_ARGMIN,
+    //    OP_GENERIC,
+    //    OP_ABSVAL,
+    //    OP_ADD_N,
+    //    OP_ARGMAX,
+    //    OP_ARGMIN,
     OP_BATCHNORM,
-//    OP_BATCHTOSPACEND,
-//    OP_BIAS,
-//    OP_BROADMUL,
-//    OP_CAST,
-//    OP_CEIL,
+    //    OP_BATCHTOSPACEND,
+    //    OP_BIAS,
+    //    OP_BROADMUL,
+    //    OP_CAST,
+    //    OP_CEIL,
     OP_CLIP,
-//    OP_COMPARISON,
+    //    OP_COMPARISON,
     OP_CONCAT,
     OP_CONST,
     OP_CONV,
-//    OP_CROP,
+    //    OP_CROP,
     OP_DECONV,
     OP_DEPTHTOSPACE,
-//    OP_DETECTION_OUTPUT,
-//    OP_DETECTION_POSTPROCESS,
+    //    OP_DETECTION_OUTPUT,
+    //    OP_DETECTION_POSTPROCESS,
     OP_DROPOUT,
     OP_ELTWISE,
     OP_ELU,
-//    OP_EMBEDDING,
-//    OP_EXPANDDIMS,
+    //    OP_EMBEDDING,
+    //    OP_EXPANDDIMS,
     OP_FC,
     OP_FLATTEN,
     OP_GATHER,
-//    OP_GEMM,
-//    OP_GRU,
-//    OP_HARDSIGMOID,
+    //    OP_GEMM,
+    //    OP_GRU,
+    //    OP_HARDSIGMOID,
     OP_HARDSWISH,
     OP_INPUT,
     OP_INSTANCENORM,
     OP_INTERP,
-//    OP_LOGICAL,
-//    OP_LOGISTIC,
-//    OP_LRN,
-//    OP_LSTM,
-//    OP_MATMUL,
-//    OP_MAXIMUM,
-//    OP_MEAN,
-//    OP_MINIMUM,
-//    OP_MVN,
-//    OP_NOOP,
-//    OP_NORMALIZE,
-//    OP_PAD,
+    //    OP_LOGICAL,
+    //    OP_LOGISTIC,
+    //    OP_LRN,
+    //    OP_LSTM,
+    //    OP_MATMUL,
+    //    OP_MAXIMUM,
+    //    OP_MEAN,
+    //    OP_MINIMUM,
+    //    OP_MVN,
+    //    OP_NOOP,
+    //    OP_NORMALIZE,
+    //    OP_PAD,
     OP_PERMUTE,
     OP_POOL,
     OP_PRELU,
-//    OP_PRIORBOX,
-//    OP_PSROIPOOLING,
-//    OP_REDUCEL2,
-//    OP_REDUCTION,
-//    OP_REGION,
+    //    OP_PRIORBOX,
+    //    OP_PSROIPOOLING,
+    //    OP_REDUCEL2,
+    //    OP_REDUCTION,
+    //    OP_REGION,
     OP_RELU,
     OP_RELU6,
-//    OP_REORG,
+    //    OP_REORG,
     OP_RESHAPE,
     OP_RESIZE,
-//    OP_REVERSE,
-//    OP_RNN,
-//    OP_ROIALIGN,
-//    OP_ROIPOOLING,
-//    OP_ROUND,
-//    OP_RPN,
+    //    OP_REVERSE,
+    //    OP_RNN,
+    //    OP_ROIALIGN,
+    //    OP_ROIPOOLING,
+    //    OP_ROUND,
+    //    OP_RPN,
     OP_SCALE,
-//    OP_SELU,
-//    OP_SHUFFLECHANNEL,
+    //    OP_SELU,
+    //    OP_SHUFFLECHANNEL,
     OP_SIGMOID,
     OP_SLICE,
     OP_SOFTMAX,
-//    OP_SPACETOBATCHND,
+    //    OP_SPACETOBATCHND,
     OP_SPACETODEPTH,
-//    OP_SPARSETODENSE,
+    //    OP_SPARSETODENSE,
     OP_SPLIT,
-//    OP_SQUAREDDIFFERENCE,
-//    OP_SQUEEZE,
-//    OP_STRIDED_SLICE,
-//    OP_SWAP_AXIS,
+    //    OP_SQUAREDDIFFERENCE,
+    //    OP_SQUEEZE,
+    //    OP_STRIDED_SLICE,
+    //    OP_SWAP_AXIS,
     OP_TANH,
-//    OP_THRESHOLD,
-//    OP_TOPKV2,
+    //    OP_THRESHOLD,
+    //    OP_TOPKV2,
     OP_TRANSPOSE,
-//    OP_UNARY,
-//    OP_UNSQUEEZE,
+    //    OP_UNARY,
+    //    OP_UNSQUEEZE,
     OP_UPSAMPLE,
-//    OP_ZEROSLIKE,
+    //    OP_ZEROSLIKE,
     OP_MISH,
-//    OP_LOGSOFTMAX,
-//    OP_RELU1,
-//    OP_L2NORMALIZATION,
-//    OP_L2POOL,
-//    OP_TILE,
-//    OP_SHAPE,
-//    OP_SCATTER,
-//    OP_WHERE,
-//    OP_SOFTPLUS,
-//    OP_RECIPROCAL,
-//    OP_BUILTIN_LAST
+    //    OP_LOGSOFTMAX,
+    //    OP_RELU1,
+    //    OP_L2NORMALIZATION,
+    //    OP_L2POOL,
+    //    OP_TILE,
+    //    OP_SHAPE,
+    //    OP_SCATTER,
+    //    OP_WHERE,
+    //    OP_SOFTPLUS,
+    //    OP_RECIPROCAL,
+    //    OP_BUILTIN_LAST
 };
diff --git a/source/device/vulkan/layer/concat_vulkan.cpp b/source/device/vulkan/layer/concat_vulkan.cpp
index 926e7b19a..99357ba52 100644
--- a/source/device/vulkan/layer/concat_vulkan.cpp
+++ b/source/device/vulkan/layer/concat_vulkan.cpp
@@ -82,52 +82,58 @@ Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    for(int i = 0; i < ir_node->input_num; i++)
+    for (int i = 0; i < ir_node->input_num; i++)
     {
-        struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = input->name;
         bottoms.push_back(name);
     }
 
-    for(int i = 0; i < ir_node->output_num; i++)
+    for (int i = 0; i < ir_node->output_num; i++)
     {
-        struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = output->name;
         tops.push_back(name);
     }
 
     // params
-    struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
-    struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    input_c = input_tensor->dims[1];   // param->input_channel;
+    struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    input_c = input_tensor->dims[1]; // param->input_channel;
     input_h = input_tensor->dims[2];
     input_w = input_tensor->dims[3];
-    output_c = output_tensor->dims[1];  // param->output_channel;
+    output_c = output_tensor->dims[1]; // param->output_channel;
     output_h = output_tensor->dims[2];
     output_w = output_tensor->dims[3];
 
-    struct concat_param *param = (struct concat_param *)ir_node->op.param_mem;
-    axis = param->axis -1;
+    struct concat_param* param = (struct concat_param*)ir_node->op.param_mem;
+    axis = param->axis - 1;
 }
 
 int Concat_vulkan::create_pipeline(const Option& _opt)
 {
     Option opt = _opt;
 
-    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
+    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0);        // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
     const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     int out_elempack = 1;
-    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;
+    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4
+                                                                                                                    : 1;
 
     int elempack = 1;
     if (axis == 0)
     {
-        if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-        if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-        if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+        if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                        : 1;
+        if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                        : 1;
+        if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                        : 1;
 
         // TODO fix other input data shape to set elempack
         // for (size_t b = 1; b < bottom_shapes.size(); b++)
@@ -328,7 +334,8 @@ int Concat_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, st
             top_w += bottom_blob.w * bottom_blob.elempack;
         }
 
-        int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1;
+        int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4
+                                                                                       : 1;
         size_t out_elemsize = elemsize / elempack * out_elempack;
 
         if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -430,7 +437,8 @@ int Concat_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, st
             top_h += bottom_blob.h * bottom_blob.elempack;
         }
 
-        int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1;
+        int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4
+                                                                                       : 1;
         size_t out_elemsize = elemsize / elempack * out_elempack;
 
         if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -557,9 +565,9 @@ int Concat_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, st
             constants[9].i = top_blob.cstep;
             constants[10].i = woffset;
 
-            const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2]
+            const Pipeline* pipeline = elempack == 8   ? pipeline_concat_pack8[b % 2]
                                        : elempack == 4 ? pipeline_concat_pack4[b % 2]
-                                       : pipeline_concat[b % 2];
+                                                       : pipeline_concat[b % 2];
 
             cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);
 
@@ -587,7 +595,8 @@ int Concat_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, st
             top_channels += bottom_blob.c * bottom_blob.elempack;
         }
 
-        int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1;
+        int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4
+                                                                                                     : 1;
         size_t out_elemsize = elemsize / elempack * out_elempack;
 
         if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -715,9 +724,9 @@ int Concat_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, st
             constants[9].i = top_blob.cstep;
             constants[10].i = hoffset;
 
-            const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2]
+            const Pipeline* pipeline = elempack == 8   ? pipeline_concat_pack8[b % 2]
                                        : elempack == 4 ? pipeline_concat_pack4[b % 2]
-                                       : pipeline_concat[b % 2];
+                                                       : pipeline_concat[b % 2];
 
             cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);
 
@@ -770,9 +779,9 @@ int Concat_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, st
             constants[9].i = top_blob.cstep;
             constants[10].i = woffset;
 
-            const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2]
+            const Pipeline* pipeline = elempack == 8   ? pipeline_concat_pack8[b % 2]
                                        : elempack == 4 ? pipeline_concat_pack4[b % 2]
-                                       : pipeline_concat[b % 2];
+                                                       : pipeline_concat[b % 2];
 
             cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);
 
@@ -785,4 +794,4 @@ int Concat_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, st
     return 0;
 }
 
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/concat_vulkan.hpp b/source/device/vulkan/layer/concat_vulkan.hpp
index 6476fc997..b03d8efe6 100644
--- a/source/device/vulkan/layer/concat_vulkan.hpp
+++ b/source/device/vulkan/layer/concat_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "concat_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Concat_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class Concat_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -76,6 +76,6 @@ class Concat_vulkan : public Layer
     int axis;
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/convolution_vulkan.cpp b/source/device/vulkan/layer/convolution_vulkan.cpp
index 5f135feba..d1c7335b6 100644
--- a/source/device/vulkan/layer/convolution_vulkan.cpp
+++ b/source/device/vulkan/layer/convolution_vulkan.cpp
@@ -70,27 +70,27 @@ Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
     // Tensor* output_tensor = t_node->GetOutputTensor(0);
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // Convolution* conv_op = dynamic_cast<Convolution*>(node->GetOp());
     // ConvParam* param = conv_op->GetParam();
-    struct conv_param *param = (struct conv_param *)ir_node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)ir_node->op.param_mem;
 
     group = param->group;
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    pad_w0 = param->pad_w0;    // left padding columns
-    pad_w1 = param->pad_w1;    // right padding columns
-    pad_h0 = param->pad_h0;    // top padding rows
-    pad_h1 = param->pad_h1;    // bottom padding rows
+    pad_w0 = param->pad_w0; // left padding columns
+    pad_w1 = param->pad_w1; // right padding columns
+    pad_h0 = param->pad_h0; // top padding rows
+    pad_h1 = param->pad_h1; // bottom padding rows
     stride_w = param->stride_w;
     stride_h = param->stride_h;
     dilation_w = param->dilation_w;
@@ -98,10 +98,10 @@ Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     kernel_w = param->kernel_w;
     kernel_h = param->kernel_h;
     activation = param->activation == 0 ? 1 : -1;
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
-    struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]);
+    struct tensor* weight = get_ir_graph_tensor(graph, node->input_tensors[1]);
     weight_data_size = weight->elem_num;
 }
 
@@ -150,7 +150,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
             shape_bordered = Tshape(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c);
         }
         else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
-            || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234))
+                 || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234))
         {
             const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
             const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
@@ -168,8 +168,10 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
         }
     }
 
-    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
-    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4
+                                                                                       : 1;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4
+                                                                                             : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -234,28 +236,28 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
 
         padding->create_pipeline(opt);
     }
-    
+
     std::vector<vk_specialization_type> specializations(10 + 10);
-    specializations[0].i = kernel_w;	// kernel_w;
-    specializations[1].i = kernel_h;	// kernel_h
-    specializations[2].i = dilation_w;	// dilation_w;
-    specializations[3].i = dilation_h;	// dilation_h;
-    specializations[4].i = stride_w;	// stride_w;
-    specializations[5].i = stride_h;	// stride_h;
-    specializations[6].i = node->input_num>2 ? 1 : 0; // bias_term;
-    specializations[7].i = activation;	// activation_type;
-    specializations[8].f = 0;//param->activation;	// activation_params.w >= 1 ? activation_params[0] : 0.f;
-    specializations[9].f = 0;//param->activation; 	// activation_params.w == 2 ? activation_params[1] : 0.f;
-    specializations[10 + 0].i = 0;//3;	// shape_bordered_packed.dims;
-    specializations[10 + 1].i = 0;//input_w + pad_w0 + pad_w1;	// shape_bordered_packed.w;
-    specializations[10 + 2].i = 0;//input_h + pad_h0 + pad_h1;	// shape_bordered_packed.h;
-    specializations[10 + 3].i = 0;//input_c;	// shape_bordered_packed.c;
-    specializations[10 + 4].i = 0;//(input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1);	// shape_bordered_packed.cstep;
-    specializations[10 + 5].i = 0;	// out_shape_packed.dims;
-    specializations[10 + 6].i = 0;//output_w;	// out_shape_packed.w;
-    specializations[10 + 7].i = 0;//output_h;	// out_shape_packed.h;
-    specializations[10 + 8].i = 0;//output_c;	// out_shape_packed.c;
-    specializations[10 + 9].i = 0;//output_w * output_h;	// out_shape_packed.cstep;
+    specializations[0].i = kernel_w;                    // kernel_w;
+    specializations[1].i = kernel_h;                    // kernel_h
+    specializations[2].i = dilation_w;                  // dilation_w;
+    specializations[3].i = dilation_h;                  // dilation_h;
+    specializations[4].i = stride_w;                    // stride_w;
+    specializations[5].i = stride_h;                    // stride_h;
+    specializations[6].i = node->input_num > 2 ? 1 : 0; // bias_term;
+    specializations[7].i = activation;                  // activation_type;
+    specializations[8].f = 0;                           //param->activation;	// activation_params.w >= 1 ? activation_params[0] : 0.f;
+    specializations[9].f = 0;                           //param->activation; 	// activation_params.w == 2 ? activation_params[1] : 0.f;
+    specializations[10 + 0].i = 0;                      //3;	// shape_bordered_packed.dims;
+    specializations[10 + 1].i = 0;                      //input_w + pad_w0 + pad_w1;	// shape_bordered_packed.w;
+    specializations[10 + 2].i = 0;                      //input_h + pad_h0 + pad_h1;	// shape_bordered_packed.h;
+    specializations[10 + 3].i = 0;                      //input_c;	// shape_bordered_packed.c;
+    specializations[10 + 4].i = 0;                      //(input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1);	// shape_bordered_packed.cstep;
+    specializations[10 + 5].i = 0;                      // out_shape_packed.dims;
+    specializations[10 + 6].i = 0;                      //output_w;	// out_shape_packed.w;
+    specializations[10 + 7].i = 0;                      //output_h;	// out_shape_packed.h;
+    specializations[10 + 8].i = 0;                      //output_c;	// out_shape_packed.c;
+    specializations[10 + 9].i = 0;                      //output_w * output_h;	// out_shape_packed.cstep;
 
     // TODO with local_size_xyz and shader_index options
 
@@ -263,9 +265,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
     local_size_xyz.w = std::min(8, out_shape_packed.w);
     local_size_xyz.h = std::min(8, out_shape_packed.h);
     local_size_xyz.c = std::min(4, out_shape_packed.c);
-    
-    // TLOG_INFO("create pipeline elempack out_elempack:%d %d\n", elempack, out_elempack);
 
+    // TLOG_INFO("create pipeline elempack out_elempack:%d %d\n", elempack, out_elempack);
 
     if (elempack == 1 && out_elempack == 1)
     {
@@ -384,7 +385,7 @@ int Convolution_vulkan::destroy_pipeline(const Option& /*opt*/)
 }
 
 int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
-{   
+{
     tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
 
     // Tensor weight_data = Tensor(weight_tensor->elem_num, 1, 1, weight_tensor->data);
@@ -399,9 +400,11 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
     int num_output = output_c;
     int num_input = input_c; //weight_data_size / maxk / num_output;
 
-    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
+    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4
+                                                                                       : 1;
     // int elempack = 1;
-    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4
+                                                                                             : 1;
 
     // TLOG_INFO("conv upload model pack:%d %d\n", elempack, out_elempack);
 
@@ -409,25 +412,24 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
     {
         Tensor weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
 
-        weight_data_packed.create(maxk, num_input/elempack, num_output/out_elempack, (size_t)4*elempack*out_elempack, elempack*out_elempack);
-        for (int q=0; q+(out_elempack-1)<num_output; q+=out_elempack)
+        weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack);
+        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
         {
-            Tensor g0 = weight_data_packed.channel(q/out_elempack);
+            Tensor g0 = weight_data_packed.channel(q / out_elempack);
 
-            for (int p=0; p+(elempack-1)<num_input; p+=elempack)
+            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
             {
-                float* g00 = g0.row(p/elempack);
+                float* g00 = g0.row(p / elempack);
 
-                for (int k=0; k<maxk; k++)
+                for (int k = 0; k < maxk; k++)
                 {
-
-                    for (int i=0; i<out_elempack; i++)
+                    for (int i = 0; i < out_elempack; i++)
                     {
-                        const Tensor k0 = weight_data_r2.channel(q+i);
+                        const Tensor k0 = weight_data_r2.channel(q + i);
 
-                        for (int j=0; j<elempack; j++)
+                        for (int j = 0; j < elempack; j++)
                         {
-                            const float* k00 = k0.row(p+j);
+                            const float* k00 = k0.row(p + j);
 
                             g00[0] = k00[k];
 
@@ -452,7 +454,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
     }
 
     // upload bias data
-    if(node->input_num > 2)
+    if (node->input_num > 2)
     {
         tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]);
         Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data);
@@ -470,7 +472,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
         {
             cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
         }
-
     }
 
     // if (innerproduct)
@@ -492,7 +493,7 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t
         bottom_blob_dim3.w = 1;
         bottom_blob_dim3.cstep = 1;
     }
-    
+
     int w = bottom_blob_dim3.w;
     int h = bottom_blob_dim3.h;
     int channels = bottom_blob_dim3.c;
@@ -500,7 +501,8 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t
     int elempack = bottom_blob_dim3.elempack;
     // TLOG_INFO("botom shape:%d %d %d %d %d %d %d\n", bottom_blob.dims, bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.elemsize, bottom_blob.elempack, bottom_blob.cstep);
 
-    int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1;
+    int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4
+                                                                                         : 1;
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     VkTensor bottom_blob_bordered = bottom_blob_dim3;
@@ -551,7 +553,7 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t
         dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
         dispatcher.h = 1;
         dispatcher.c = top_blob.c;
-        
+
         cmd.record_pipeline(pipeline_convolution_pack4_1x1s1d1, bindings, constants, dispatcher);
     }
     else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
@@ -609,7 +611,7 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t
 
     // TLOG_INFO("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w);
     // cmd.record_pipeline(pipeline_convolution, bindings, constants, top_blob);
-	// TLOG_INFO("run record convolution\n");
+    // TLOG_INFO("run record convolution\n");
     return 0;
 }
 
diff --git a/source/device/vulkan/layer/convolution_vulkan.hpp b/source/device/vulkan/layer/convolution_vulkan.hpp
index a1e7c1ad8..c0799f877 100644
--- a/source/device/vulkan/layer/convolution_vulkan.hpp
+++ b/source/device/vulkan/layer/convolution_vulkan.hpp
@@ -63,16 +63,15 @@ class Convolution_vulkan : public Layer
     // virtual int record_pipeline(VkCompute& cmd, const Option& opt) const;
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
-
 public:
     int group;
     int input_c;
     int input_h;
     int input_w;
-    int pad_w0;  // left padding columns
-    int pad_w1;  // right padding columns
-    int pad_h0;  // top padding rows
-    int pad_h1;  // bottom padding rows
+    int pad_w0; // left padding columns
+    int pad_w1; // right padding columns
+    int pad_h0; // top padding rows
+    int pad_h1; // bottom padding rows
     int stride_h;
     int stride_w;
     int dilation_h;
@@ -111,5 +110,4 @@ class Convolution_vulkan : public Layer
 
 } // namespace TEngine
 
-
 #endif
diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp
index bc950cf38..51f83b773 100644
--- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp
+++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp
@@ -42,52 +42,52 @@
 
 namespace TEngine {
 
-    ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
-    {
-        support_vulkan = true;
-        pipeline_convolutiondepthwise = 0;
-    }
+ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
+{
+    support_vulkan = true;
+    pipeline_convolutiondepthwise = 0;
+}
 
-    ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
-    {
-        support_vulkan = true;
+ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
+{
+    support_vulkan = true;
 
-        padding = 0;
+    padding = 0;
 
-        pipeline_convolutiondepthwise = 0;
-        pipeline_convolutiondepthwise_pack4 = 0;
-        pipeline_convolutiondepthwise_pack8 = 0;
-        graph = ir_graph;
-        node = ir_node;
-
-        struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
-        std::string name = input->name;
-        bottoms.push_back(name);
-
-        struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
-        name = output->name;
-        tops.push_back(name);
-
-        struct conv_param *param = (struct conv_param *)ir_node->op.param_mem;
-
-        group = param->group;
-        input_c = input->dims[1];   // param->input_channel;
-        input_h = input->dims[2];
-        input_w = input->dims[3];
-        pad_w0 = param->pad_w0;    // left padding columns
-        pad_w1 = param->pad_w1;    // right padding columns
-        pad_h0 = param->pad_h0;    // top padding rows
-        pad_h1 = param->pad_h1;    // bottom padding rows
-        stride_w = param->stride_w;
-        stride_h = param->stride_h;
-        dilation_w = param->dilation_w;
-        dilation_h = param->dilation_h;
-        kernel_w = param->kernel_w;
-        kernel_h = param->kernel_h;
-        output_c = output->dims[1];  // param->output_channel;
-        output_h = output->dims[2];
-        output_w = output->dims[3];
-    }
+    pipeline_convolutiondepthwise = 0;
+    pipeline_convolutiondepthwise_pack4 = 0;
+    pipeline_convolutiondepthwise_pack8 = 0;
+    graph = ir_graph;
+    node = ir_node;
+
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    std::string name = input->name;
+    bottoms.push_back(name);
+
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    name = output->name;
+    tops.push_back(name);
+
+    struct conv_param* param = (struct conv_param*)ir_node->op.param_mem;
+
+    group = param->group;
+    input_c = input->dims[1]; // param->input_channel;
+    input_h = input->dims[2];
+    input_w = input->dims[3];
+    pad_w0 = param->pad_w0; // left padding columns
+    pad_w1 = param->pad_w1; // right padding columns
+    pad_h0 = param->pad_h0; // top padding rows
+    pad_h1 = param->pad_h1; // bottom padding rows
+    stride_w = param->stride_w;
+    stride_h = param->stride_h;
+    dilation_w = param->dilation_w;
+    dilation_h = param->dilation_h;
+    kernel_w = param->kernel_w;
+    kernel_h = param->kernel_h;
+    output_c = output->dims[1]; // param->output_channel;
+    output_h = output->dims[2];
+    output_w = output->dims[3];
+}
 
 int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
 {
@@ -114,13 +114,14 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
         padding->create_pipeline(opt);
     }
 
-
     // const int maxk = kernel_w * kernel_h;
     int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group;
     int num_output = output_c;
 
-    int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
-    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4
+                                                                                     : 1;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4
+                                                                                             : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -141,27 +142,27 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
     }
 
     std::vector<vk_specialization_type> specializations(11 + 10);
-    specializations[0].i = kernel_w;	// kernel_w;
-    specializations[1].i = kernel_h;	// kernel_h
-    specializations[2].i = dilation_w;	// dilation_w;
-    specializations[3].i = dilation_h;	// dilation_h;
-    specializations[4].i = stride_w;	// stride_w;
-    specializations[5].i = stride_h;	// stride_h;
-    specializations[6].i = node->input_num >2 ? 1 : 0; // bias_term;
+    specializations[0].i = kernel_w;                    // kernel_w;
+    specializations[1].i = kernel_h;                    // kernel_h
+    specializations[2].i = dilation_w;                  // dilation_w;
+    specializations[3].i = dilation_h;                  // dilation_h;
+    specializations[4].i = stride_w;                    // stride_w;
+    specializations[5].i = stride_h;                    // stride_h;
+    specializations[6].i = node->input_num > 2 ? 1 : 0; // bias_term;
     specializations[7].i = group;
-    specializations[8].i = 1;//param->activation;	// activation_type;
-    specializations[9].f = 0;//param->activation;	// activation_params.w >= 1 ? activation_params[0] : 0.f;
-    specializations[10].f = 0;//param->activation; 	// activation_params.w == 2 ? activation_params[1] : 0.f;
-    specializations[11 + 0].i = 0;  // 3;	// shape_bordered_packed.dims;
-    specializations[11 + 1].i = 0;  // input_w + pad_w0 + pad_w1;	// shape_bordered_packed.w;
-    specializations[11 + 2].i = 0;  // input_h + pad_h0 + pad_h1;	// shape_bordered_packed.h;
-    specializations[11 + 3].i = 0;  // input_c;	// shape_bordered_packed.c;
-    specializations[11 + 4].i = 0;  // (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1);	// shape_bordered_packed.cstep;
-    specializations[11 + 5].i = 0;  // 3;	// out_shape_packed.dims;
-    specializations[11 + 6].i = 0;  // output_w;	// out_shape_packed.w;
-    specializations[11 + 7].i = 0;  // output_h;	// out_shape_packed.h;
-    specializations[11 + 8].i = 0;  // output_c;	// out_shape_packed.c;
-    specializations[11 + 9].i = 0;  // output_w * output_h;	// out_shape_packed.cstep;
+    specializations[8].i = 1;      //param->activation;	// activation_type;
+    specializations[9].f = 0;      //param->activation;	// activation_params.w >= 1 ? activation_params[0] : 0.f;
+    specializations[10].f = 0;     //param->activation; 	// activation_params.w == 2 ? activation_params[1] : 0.f;
+    specializations[11 + 0].i = 0; // 3;	// shape_bordered_packed.dims;
+    specializations[11 + 1].i = 0; // input_w + pad_w0 + pad_w1;	// shape_bordered_packed.w;
+    specializations[11 + 2].i = 0; // input_h + pad_h0 + pad_h1;	// shape_bordered_packed.h;
+    specializations[11 + 3].i = 0; // input_c;	// shape_bordered_packed.c;
+    specializations[11 + 4].i = 0; // (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1);	// shape_bordered_packed.cstep;
+    specializations[11 + 5].i = 0; // 3;	// out_shape_packed.dims;
+    specializations[11 + 6].i = 0; // output_w;	// out_shape_packed.w;
+    specializations[11 + 7].i = 0; // output_h;	// out_shape_packed.h;
+    specializations[11 + 8].i = 0; // output_c;	// out_shape_packed.c;
+    specializations[11 + 9].i = 0; // output_w * output_h;	// out_shape_packed.cstep;
 
     VkTensor local_size_xyz;
     local_size_xyz.w = std::min(4, output_w);
@@ -217,14 +218,15 @@ int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
 
 int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
-        // upload kernel data
+    // upload kernel data
     const int maxk = kernel_w * kernel_h;
     int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group;
     int num_output = output_c;
 
-    int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
-    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
-
+    int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4
+                                                                                     : 1;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4
+                                                                                             : 1;
 
     tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
     Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data);
@@ -236,13 +238,13 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt
     cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
 
     // upload bias data
-    if(node->input_num > 2)
+    if (node->input_num > 2)
     {
         tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]);
         Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data);
         Tensor bias_data_packed;
         convert_packing(bias_data, bias_data_packed, out_elempack);
-	    cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
+        cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
     }
     return 0;
 }
@@ -255,7 +257,6 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk
     size_t elemsize = bottom_blob.elemsize;
     int elempack = bottom_blob.elempack;
 
-
     VkTensor bottom_blob_bordered = bottom_blob;
     if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0)
     {
@@ -268,7 +269,7 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk
         padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
     }
 
-    top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator);
+    top_blob.create(output_w, output_h, output_c / elempack, elemsize, elempack, opt.blob_vkallocator);
 
     std::vector<VkTensor> bindings(4);
     bindings[0] = bottom_blob_bordered;
@@ -289,13 +290,13 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk
     constants[9].i = top_blob.cstep;
 
     // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w);
-    const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8
-                                   : elempack == 4 ? pipeline_convolutiondepthwise_pack4
-                                   : pipeline_convolutiondepthwise;
+    const Pipeline* pipeline = elempack == 8   ? pipeline_convolutiondepthwise_pack8
+                               : elempack == 4 ? pipeline_convolutiondepthwise_pack4
+                                               : pipeline_convolutiondepthwise;
 
     cmd.record_pipeline(pipeline, bindings, constants, top_blob);
 
     return 0;
 }
 
-}
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp
index 05f78f22c..7b867529b 100644
--- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp
+++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp
@@ -65,10 +65,10 @@ class ConvolutionDepthWise_vulkan : public Layer
     int input_c;
     int input_h;
     int input_w;
-    int pad_w0;  // left padding columns
-    int pad_w1;  // right padding columns
-    int pad_h0;  // top padding rows
-    int pad_h1;  // bottom padding rows
+    int pad_w0; // left padding columns
+    int pad_w1; // right padding columns
+    int pad_h0; // top padding rows
+    int pad_h1; // bottom padding rows
     int stride_h;
     int stride_w;
     int dilation_h;
@@ -92,5 +92,4 @@ class ConvolutionDepthWise_vulkan : public Layer
 
 } // namespace TEngine
 
-
 #endif
diff --git a/source/device/vulkan/layer/crop_vulkan.cpp b/source/device/vulkan/layer/crop_vulkan.cpp
index 26f8768e8..d00325e34 100644
--- a/source/device/vulkan/layer/crop_vulkan.cpp
+++ b/source/device/vulkan/layer/crop_vulkan.cpp
@@ -76,36 +76,36 @@ Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    for(int i = 0; i < ir_node->input_num; i++)
+    for (int i = 0; i < ir_node->input_num; i++)
     {
-        struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = input->name;
         bottoms.push_back(name);
     }
 
-    for(int i = 0; i < ir_node->output_num; i++)
+    for (int i = 0; i < ir_node->output_num; i++)
     {
-        struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = output->name;
         tops.push_back(name);
     }
 
     // params
-    struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
-    struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    input_c = input_tensor->dims[1];   // param->input_channel;
+    struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    input_c = input_tensor->dims[1]; // param->input_channel;
     input_h = input_tensor->dims[2];
     input_w = input_tensor->dims[3];
-    output_c = output_tensor->dims[1];  // param->output_channel;
+    output_c = output_tensor->dims[1]; // param->output_channel;
     output_h = output_tensor->dims[2];
     output_w = output_tensor->dims[3];
 
-    struct crop_param *param = (struct crop_param *)ir_node->op.param_mem;
+    struct crop_param* param = (struct crop_param*)ir_node->op.param_mem;
 
     int num_args = param->num_args;
-    int offset_c = 0;   // param->offset_c;
-    int offset_h = 0;   // param->offset_h;
-    int offset_w = 0;   // param->offset_w;
+    int offset_c = 0; // param->offset_c;
+    int offset_h = 0; // param->offset_h;
+    int offset_w = 0; // param->offset_w;
     int crop_h = param->crop_h;
     int crop_w = param->crop_w;
     int center_crop = param->center_crop;
@@ -117,27 +117,34 @@ int Crop_vulkan::create_pipeline(const Option& _opt)
 {
     Option opt = _opt;
 
-    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
+    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0);        // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
     const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     int out_elempack = 1;
-    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;
+    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4
+                                                                                                                    : 1;
 
     int offset_elempack = 1;
-    
+
     {
         // TODO vec and image crop
         if (offset_c == 0)
             offset_elempack = elempack;
         else
-            offset_elempack = opt.use_shader_pack8 && offset_c % 8 == 0 ? 8 : offset_c % 4 == 0 ? 4 : 1;
+            offset_elempack = opt.use_shader_pack8 && offset_c % 8 == 0 ? 8 : offset_c % 4 == 0 ? 4
+                                                                                                : 1;
     }
 
     size_t elemsize;
@@ -192,16 +199,16 @@ int Crop_vulkan::create_pipeline(const Option& _opt)
 
     std::vector<vk_specialization_type> specializations(1 + 10);
     specializations[0].i = vkdev->info.bug_implicit_fp16_arithmetic;
-    specializations[1 + 0].i = 0;   // shape_unpacked.dims;
-    specializations[1 + 1].i = 0;   // shape_unpacked.w;
-    specializations[1 + 2].i = 0;   // shape_unpacked.h;
-    specializations[1 + 3].i = 0;   // shape_unpacked.c;
-    specializations[1 + 4].i = 0;   // shape_unpacked.cstep;
-    specializations[1 + 5].i = 0;   // out_shape_packed.dims;
-    specializations[1 + 6].i = 0;   // out_shape_packed.w;
-    specializations[1 + 7].i = 0;   // out_shape_packed.h;
-    specializations[1 + 8].i = 0;   // out_shape_packed.c;
-    specializations[1 + 9].i = 0;   // out_shape_packed.cstep;
+    specializations[1 + 0].i = 0; // shape_unpacked.dims;
+    specializations[1 + 1].i = 0; // shape_unpacked.w;
+    specializations[1 + 2].i = 0; // shape_unpacked.h;
+    specializations[1 + 3].i = 0; // shape_unpacked.c;
+    specializations[1 + 4].i = 0; // shape_unpacked.cstep;
+    specializations[1 + 5].i = 0; // out_shape_packed.dims;
+    specializations[1 + 6].i = 0; // out_shape_packed.w;
+    specializations[1 + 7].i = 0; // out_shape_packed.h;
+    specializations[1 + 8].i = 0; // out_shape_packed.c;
+    specializations[1 + 9].i = 0; // out_shape_packed.cstep;
 
     Tensor local_size_xyz;
     if (out_shape_packed.dims == 1)
@@ -295,7 +302,6 @@ int Crop_vulkan::create_pipeline(const Option& _opt)
         pipeline_crop_pack8to1->create(LayerShaderType::crop_pack8to1, opt, specializations);
     }
 
-   
     return 0;
 }
 
@@ -357,9 +363,12 @@ int Crop_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob
             return 0;
         }
 
-        int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1;
+        int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8
+                                                     : _coffset % 4 == 0                           ? 4
+                                                                                                   : 1;
 
-        int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
+        int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4
+                                                                                       : 1;
         size_t out_elemsize = elemsize / elempack * out_elempack;
 
         if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -483,9 +492,9 @@ int Crop_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, std:
     _outw = output_w;
     _outh = output_h;
     _outc = output_c;
-    _woffset = 0;   // offset_w;
-    _hoffset = 0;   // offset_h;
-    _coffset = 0;   // offset_c;
+    _woffset = 0; // offset_w;
+    _hoffset = 0; // offset_h;
+    _coffset = 0; // offset_c;
 
     // TODO vec and image crop
 
@@ -497,9 +506,12 @@ int Crop_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, std:
             return 0;
         }
 
-        int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1;
+        int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8
+                                                     : _coffset % 4 == 0                           ? 4
+                                                                                                   : 1;
 
-        int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
+        int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4
+                                                                                       : 1;
         size_t out_elemsize = elemsize / elempack * out_elempack;
 
         if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -604,4 +616,4 @@ int Crop_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, std:
     return 0;
 }
 
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/crop_vulkan.hpp b/source/device/vulkan/layer/crop_vulkan.hpp
index 1a55f3ca1..2316f07c0 100644
--- a/source/device/vulkan/layer/crop_vulkan.hpp
+++ b/source/device/vulkan/layer/crop_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "crop_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Crop_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class Crop_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     void resolve_crop_roi(const Tensor& bottom_blob, int& _woffset, int& _hoffset, int& _coffset, int& _outw, int& _outh, int& _outc) const;
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
     virtual int record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const;
@@ -78,7 +78,7 @@ class Crop_vulkan : public Layer
     int output_c;
     int output_h;
     int output_w;
-    
+
     int num_args;
     int offset_c;
     int offset_h;
@@ -90,6 +90,6 @@ class Crop_vulkan : public Layer
     int flag;
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/dropout_vulkan.cpp b/source/device/vulkan/layer/dropout_vulkan.cpp
index a6c3e0724..bf46fa34c 100644
--- a/source/device/vulkan/layer/dropout_vulkan.cpp
+++ b/source/device/vulkan/layer/dropout_vulkan.cpp
@@ -64,26 +64,26 @@ Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // params
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
 
-    if(input->scale != 0)
+    if (input->scale != 0)
         scale = input->scale;
     else
-        scale = 1.0f;      
+        scale = 1.0f;
 }
 
 int Dropout_vulkan::create_pipeline(const Option& opt)
@@ -91,9 +91,12 @@ int Dropout_vulkan::create_pipeline(const Option& opt)
     const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     size_t elemsize;
     if (opt.use_fp16_storage)
@@ -202,15 +205,13 @@ int Dropout_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c
     constants[3].i = bottom_top_blob.c;
     constants[4].i = bottom_top_blob.cstep;
 
-    const Pipeline* pipeline = elempack == 8 ? pipeline_dropout_pack8
+    const Pipeline* pipeline = elempack == 8   ? pipeline_dropout_pack8
                                : elempack == 4 ? pipeline_dropout_pack4
-                               : pipeline_dropout;
+                                               : pipeline_dropout;
 
     cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
 
     return 0;
 }
 
-
-
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/dropout_vulkan.hpp b/source/device/vulkan/layer/dropout_vulkan.hpp
index b6e943889..478345ca7 100644
--- a/source/device/vulkan/layer/dropout_vulkan.hpp
+++ b/source/device/vulkan/layer/dropout_vulkan.hpp
@@ -43,7 +43,7 @@
 #include "../vulkan_layer.hpp"
 #include "../vulkan_command.hpp"
 
-namespace TEngine{
+namespace TEngine {
 
 class Dropout_vulkan : public Layer
 {
@@ -54,7 +54,7 @@ class Dropout_vulkan : public Layer
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
     // virtual int upload_model(VkTransfer& cmd, const Option& opt);
-    
+
     virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -70,9 +70,8 @@ class Dropout_vulkan : public Layer
     int output_h;
     int output_w;
     float scale;
-
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/eltwise_vulkan.cpp b/source/device/vulkan/layer/eltwise_vulkan.cpp
index 9fc322bc9..a8d112bf4 100644
--- a/source/device/vulkan/layer/eltwise_vulkan.cpp
+++ b/source/device/vulkan/layer/eltwise_vulkan.cpp
@@ -70,22 +70,22 @@ Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    for(int i = 0; i < ir_node->input_num; i++)
+    for (int i = 0; i < ir_node->input_num; i++)
     {
-        struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = input->name;
         bottoms.push_back(name);
     }
 
-    for(int i = 0; i < ir_node->output_num; i++)
+    for (int i = 0; i < ir_node->output_num; i++)
     {
-        struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = output->name;
         tops.push_back(name);
     }
 
-    struct eltwise_param *param = (struct eltwise_param *)ir_node->op.param_mem;
-    op_type = (param -> type) / 2;
+    struct eltwise_param* param = (struct eltwise_param*)ir_node->op.param_mem;
+    op_type = (param->type) / 2;
 }
 
 int Eltwise_vulkan::create_pipeline(const Option& opt)
@@ -93,9 +93,12 @@ int Eltwise_vulkan::create_pipeline(const Option& opt)
     const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     size_t elemsize;
     if (opt.use_fp16_storage)
@@ -118,12 +121,12 @@ int Eltwise_vulkan::create_pipeline(const Option& opt)
 
     std::vector<vk_specialization_type> specializations(2 + 5);
     specializations[0].i = op_type;
-    specializations[1].i = 0;   // coeffs.w == 0 ? 0 : 1;   TODO fix coeffs value
-    specializations[2 + 0].i = 0;   // shape_packed.dims;
-    specializations[2 + 1].i = 0;   // shape_packed.w;
-    specializations[2 + 2].i = 0;   // shape_packed.h;
-    specializations[2 + 3].i = 0;   // shape_packed.c;
-    specializations[2 + 4].i = 0;   // shape_packed.cstep;
+    specializations[1].i = 0;     // coeffs.w == 0 ? 0 : 1;   TODO fix coeffs value
+    specializations[2 + 0].i = 0; // shape_packed.dims;
+    specializations[2 + 1].i = 0; // shape_packed.w;
+    specializations[2 + 2].i = 0; // shape_packed.h;
+    specializations[2 + 3].i = 0; // shape_packed.c;
+    specializations[2 + 4].i = 0; // shape_packed.cstep;
 
     Tensor local_size_xyz;
     if (shape_packed.dims == 1)
@@ -228,12 +231,12 @@ int Eltwise_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, s
     constants[2].i = top_blob.h;
     constants[3].i = top_blob.c;
     constants[4].i = top_blob.cstep;
-    constants[5].f = 1.0f;  // coeffs.w == 0 ? 1.f : coeffs[0];     TODO fix coeffs value
-    constants[6].f = 1.0f;  // coeffs.w == 0 ? 1.f : coeffs[1];
+    constants[5].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[0];     TODO fix coeffs value
+    constants[6].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[1];
 
-    const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1]
+    const Pipeline* pipeline = elempack == 8   ? pipeline_eltwise_pack8[1]
                                : elempack == 4 ? pipeline_eltwise_pack4[1]
-                               : pipeline_eltwise[1];
+                                               : pipeline_eltwise[1];
 
     cmd.record_pipeline(pipeline, bindings, constants, top_blob);
 
@@ -251,11 +254,11 @@ int Eltwise_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, s
         constants[3].i = top_blob.c;
         constants[4].i = top_blob.cstep;
         constants[5].f = 1.f;
-        constants[6].f = 1.0f;  // coeffs.w == 0 ? 1 : coeffs[b];       TODO fixcoeffs value
+        constants[6].f = 1.0f; // coeffs.w == 0 ? 1 : coeffs[b];       TODO fixcoeffs value
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b % 2]
+        const Pipeline* pipeline = elempack == 8   ? pipeline_eltwise_pack8[b % 2]
                                    : elempack == 4 ? pipeline_eltwise_pack4[b % 2]
-                                   : pipeline_eltwise[b % 2];
+                                                   : pipeline_eltwise[b % 2];
 
         cmd.record_pipeline(pipeline, bindings, constants, top_blob);
     }
@@ -263,4 +266,4 @@ int Eltwise_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, s
     return 0;
 }
 
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/eltwise_vulkan.hpp b/source/device/vulkan/layer/eltwise_vulkan.hpp
index 5830b076d..5830aea6a 100644
--- a/source/device/vulkan/layer/eltwise_vulkan.hpp
+++ b/source/device/vulkan/layer/eltwise_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "eltwise_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Eltwise_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class Eltwise_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -84,7 +84,7 @@ class Eltwise_vulkan : public Layer
         ELT_SQUARE,
         ELT_POW
     };
-    int op_type;    // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2
+    int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2
 
     int input_c;
     int input_h;
@@ -94,6 +94,6 @@ class Eltwise_vulkan : public Layer
     int output_w;
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/flatten_vulkan.cpp b/source/device/vulkan/layer/flatten_vulkan.cpp
index 589b7d5d4..798402f2c 100644
--- a/source/device/vulkan/layer/flatten_vulkan.cpp
+++ b/source/device/vulkan/layer/flatten_vulkan.cpp
@@ -70,22 +70,22 @@ Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // params
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
-    output_size = output->dims[3]*output->dims[2]*output->dims[1];
+    output_size = output->dims[3] * output->dims[2] * output->dims[1];
 }
 
 int Flatten_vulkan::create_pipeline(const Option& _opt)
@@ -95,14 +95,17 @@ int Flatten_vulkan::create_pipeline(const Option& _opt)
     // const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0];
     const Tensor& out_shape = Tensor(output_size, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0];
 
-
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     int out_elempack = 1;
-    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
+    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4
+                                                                                                                    : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -137,16 +140,16 @@ int Flatten_vulkan::create_pipeline(const Option& _opt)
     }
 
     std::vector<vk_specialization_type> specializations(0 + 10);
-    specializations[0 + 0].i = 0;   // shape_packed.dims;
-    specializations[0 + 1].i = 0;   // shape_packed.w;
-    specializations[0 + 2].i = 0;   // shape_packed.h;
-    specializations[0 + 3].i = 0;   // shape_packed.c;
-    specializations[0 + 4].i = 0;   // shape_packed.cstep;
-    specializations[0 + 5].i = 0;   // out_shape_packed.dims;
-    specializations[0 + 6].i = 0;   // out_shape_packed.w;
-    specializations[0 + 7].i = 0;   // out_shape_packed.h;
-    specializations[0 + 8].i = 0;   // out_shape_packed.c;
-    specializations[0 + 9].i = 0;   // out_shape_packed.cstep;
+    specializations[0 + 0].i = 0; // shape_packed.dims;
+    specializations[0 + 1].i = 0; // shape_packed.w;
+    specializations[0 + 2].i = 0; // shape_packed.h;
+    specializations[0 + 3].i = 0; // shape_packed.c;
+    specializations[0 + 4].i = 0; // shape_packed.cstep;
+    specializations[0 + 5].i = 0; // out_shape_packed.dims;
+    specializations[0 + 6].i = 0; // out_shape_packed.w;
+    specializations[0 + 7].i = 0; // out_shape_packed.h;
+    specializations[0 + 8].i = 0; // out_shape_packed.c;
+    specializations[0 + 9].i = 0; // out_shape_packed.cstep;
 
     Tensor local_size_xyz(64, 1, 1, (void*)0);
     if (out_shape_packed.dims != 0)
@@ -207,8 +210,6 @@ int Flatten_vulkan::create_pipeline(const Option& _opt)
     return 0;
 }
 
-
-
 int Flatten_vulkan::destroy_pipeline(const Option& /*opt*/)
 {
     delete pipeline_flatten;
@@ -250,7 +251,8 @@ int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
 
     int total = w * h * channels * elempack;
 
-    int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 : 1;
+    int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4
+                                                                                   : 1;
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -323,4 +325,4 @@ int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
     return 0;
 }
 
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/flatten_vulkan.hpp b/source/device/vulkan/layer/flatten_vulkan.hpp
index 91de06f9f..cd364ddf2 100644
--- a/source/device/vulkan/layer/flatten_vulkan.hpp
+++ b/source/device/vulkan/layer/flatten_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "flatten_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Flatten_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class Flatten_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -74,9 +74,8 @@ class Flatten_vulkan : public Layer
     int output_h;
     int output_w;
     int output_size;
-
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/innerproduct_vulkan.cpp b/source/device/vulkan/layer/innerproduct_vulkan.cpp
index c4ba14e99..8e1d66b8a 100644
--- a/source/device/vulkan/layer/innerproduct_vulkan.cpp
+++ b/source/device/vulkan/layer/innerproduct_vulkan.cpp
@@ -80,35 +80,34 @@ InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_nod
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
-    struct fc_param *param = (struct fc_param *)ir_node->op.param_mem;
+    struct fc_param* param = (struct fc_param*)ir_node->op.param_mem;
 
     num_output = param->num_output;
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
 
-    struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]);
+    struct tensor* weight = get_ir_graph_tensor(graph, node->input_tensors[1]);
     weight_data_size = weight->elem_num;
 
     activation_type = -1;
-
 }
 
 int InnerProduct_vulkan::create_pipeline(const Option& _opt)
 {
     Option opt = _opt;
-    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
+    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0);        // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
     const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     Tensor shape_flatten;
@@ -119,8 +118,10 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
 
     int num_input = weight_data_size / num_output;
 
-    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
-    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4
+                                                                                       : 1;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4
+                                                                                             : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -161,27 +162,26 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
         flatten->output_w = shape_flatten.w;
         flatten->output_h = shape_flatten.h;
         flatten->output_c = shape_flatten.c;
-        flatten->output_size = shape_flatten.w*shape_flatten.h*shape_flatten.c;
+        flatten->output_size = shape_flatten.w * shape_flatten.h * shape_flatten.c;
 
         flatten->create_pipeline(opt);
     }
 
-
     std::vector<vk_specialization_type> specializations(4 + 10);
     specializations[0].i = bias_term;
     specializations[1].i = activation_type;
-    specializations[2].f = 0.f; // activation_params.w >= 1 ? activation_params[0] : 0.f;
-    specializations[3].f = 0.f; // activation_params.w == 2 ? activation_params[1] : 0.f;
-    specializations[4 + 0].i = 0;   // shape_flatten_packed.dims;
-    specializations[4 + 1].i = 0;   // shape_flatten_packed.w;
-    specializations[4 + 2].i = 0;   // shape_flatten_packed.h;
-    specializations[4 + 3].i = 0;   // shape_flatten_packed.c;
-    specializations[4 + 4].i = 0;   // shape_flatten_packed.cstep;
-    specializations[4 + 5].i = 0;   // out_shape_packed.dims;
-    specializations[4 + 6].i = 0;   // out_shape_packed.w;
-    specializations[4 + 7].i = 0;   // out_shape_packed.h;
-    specializations[4 + 8].i = 0;   // out_shape_packed.c;
-    specializations[4 + 9].i = 0;   // out_shape_packed.cstep;
+    specializations[2].f = 0.f;   // activation_params.w >= 1 ? activation_params[0] : 0.f;
+    specializations[3].f = 0.f;   // activation_params.w == 2 ? activation_params[1] : 0.f;
+    specializations[4 + 0].i = 0; // shape_flatten_packed.dims;
+    specializations[4 + 1].i = 0; // shape_flatten_packed.w;
+    specializations[4 + 2].i = 0; // shape_flatten_packed.h;
+    specializations[4 + 3].i = 0; // shape_flatten_packed.c;
+    specializations[4 + 4].i = 0; // shape_flatten_packed.cstep;
+    specializations[4 + 5].i = 0; // out_shape_packed.dims;
+    specializations[4 + 6].i = 0; // out_shape_packed.w;
+    specializations[4 + 7].i = 0; // out_shape_packed.h;
+    specializations[4 + 8].i = 0; // out_shape_packed.c;
+    specializations[4 + 9].i = 0; // out_shape_packed.cstep;
 
     Tensor local_size_xyz(std::min(64, num_output / out_elempack), 1, 1, (void*)0);
     if (out_shape_packed.dims != 0)
@@ -309,8 +309,10 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
     int num_input = weight_data_size / num_output;
 
-    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
-    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4
+                                                                                       : 1;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4
+                                                                                             : 1;
 
     // src = inch-outch
     // dst = pa-pb-inch/pa-outch/pb
@@ -386,7 +388,8 @@ int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor&
     size_t elemsize = bottom_blob_flattened.elemsize;
     int elempack = bottom_blob_flattened.elempack;
 
-    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4
+                                                                                             : 1;
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
     if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -461,4 +464,4 @@ int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor&
     return 0;
 }
 
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/innerproduct_vulkan.hpp b/source/device/vulkan/layer/innerproduct_vulkan.hpp
index c682bcb46..c66c36947 100644
--- a/source/device/vulkan/layer/innerproduct_vulkan.hpp
+++ b/source/device/vulkan/layer/innerproduct_vulkan.hpp
@@ -58,7 +58,7 @@ class InnerProduct_vulkan : public Layer
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
     virtual int upload_model(VkTransfer& cmd, const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -98,6 +98,6 @@ class InnerProduct_vulkan : public Layer
     int output_w;
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
-#endif  // LAYER_INNERPRODUCT_VULKAN_H
\ No newline at end of file
+#endif // LAYER_INNERPRODUCT_VULKAN_H
\ No newline at end of file
diff --git a/source/device/vulkan/layer/interp_vulkan.cpp b/source/device/vulkan/layer/interp_vulkan.cpp
index 586846b72..81c8ae748 100644
--- a/source/device/vulkan/layer/interp_vulkan.cpp
+++ b/source/device/vulkan/layer/interp_vulkan.cpp
@@ -76,23 +76,23 @@ Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // params
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
 
-    struct interp_param *param = (struct interp_param *)ir_node->op.param_mem;
+    struct interp_param* param = (struct interp_param*)ir_node->op.param_mem;
 
     if (param->height_scale != 0 && param->width_scale != 0)
     {
@@ -101,27 +101,33 @@ Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     }
     else
     {
-        height_scale = (float )output->dims[2] / (float )input_h;
-        width_scale = (float )output->dims[2] / (float )input_w;
+        height_scale = (float)output->dims[2] / (float)input_h;
+        width_scale = (float)output->dims[2] / (float)input_w;
     }
-    resize_type = 2;//param->resize_type;
+    resize_type = 2; //param->resize_type;
 }
 
 int Interp_vulkan::create_pipeline(const Option& _opt)
 {
     Option opt = _opt;
-    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0];
+    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0);        // bottom_shapes.empty() ? Mat() : bottom_shapes[0];
     const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     int out_elempack = 1;
-    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;
+    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4
+                                                                                                                    : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -162,16 +168,16 @@ int Interp_vulkan::create_pipeline(const Option& _opt)
     {
         std::vector<vk_specialization_type> specializations(1 + 10);
         specializations[0].i = resize_type;
-        specializations[1 + 0].i = 0;   // shape_packed.dims;
-        specializations[1 + 1].i = 0;   // shape_packed.w;
-        specializations[1 + 2].i = 0;   // shape_packed.h;
-        specializations[1 + 3].i = 0;   // shape_packed.c;
-        specializations[1 + 4].i = 0;   // shape_packed.cstep;
-        specializations[1 + 5].i = 0;   // out_shape_packed.dims;
-        specializations[1 + 6].i = 0;   // out_shape_packed.w;
-        specializations[1 + 7].i = 0;   // out_shape_packed.h;
-        specializations[1 + 8].i = 0;   // out_shape_packed.c;
-        specializations[1 + 9].i = 0;   // out_shape_packed.cstep;
+        specializations[1 + 0].i = 0; // shape_packed.dims;
+        specializations[1 + 1].i = 0; // shape_packed.w;
+        specializations[1 + 2].i = 0; // shape_packed.h;
+        specializations[1 + 3].i = 0; // shape_packed.c;
+        specializations[1 + 4].i = 0; // shape_packed.cstep;
+        specializations[1 + 5].i = 0; // out_shape_packed.dims;
+        specializations[1 + 6].i = 0; // out_shape_packed.w;
+        specializations[1 + 7].i = 0; // out_shape_packed.h;
+        specializations[1 + 8].i = 0; // out_shape_packed.c;
+        specializations[1 + 9].i = 0; // out_shape_packed.cstep;
 
         Tensor local_size_xyz;
         if (out_shape_packed.dims == 2)
@@ -250,16 +256,16 @@ int Interp_vulkan::create_pipeline(const Option& _opt)
         }
 
         std::vector<vk_specialization_type> specializations(0 + 10);
-        specializations[0 + 0].i = 0;   // shape_packed.dims;
-        specializations[0 + 1].i = 0;   // shape_packed.w;
-        specializations[0 + 2].i = 0;   // shape_packed.h;
-        specializations[0 + 3].i = 0;   // shape_packed.c;
-        specializations[0 + 4].i = 0;   // shape_packed.cstep;
-        specializations[0 + 5].i = 0;   // out_shape_packed.dims;
-        specializations[0 + 6].i = 0;   // out_shape_packed.w;
-        specializations[0 + 7].i = 0;   // out_shape_packed.h;
-        specializations[0 + 8].i = 0;   // out_shape_packed.c;
-        specializations[0 + 9].i = 0;   // out_shape_packed.cstep;
+        specializations[0 + 0].i = 0; // shape_packed.dims;
+        specializations[0 + 1].i = 0; // shape_packed.w;
+        specializations[0 + 2].i = 0; // shape_packed.h;
+        specializations[0 + 3].i = 0; // shape_packed.c;
+        specializations[0 + 4].i = 0; // shape_packed.cstep;
+        specializations[0 + 5].i = 0; // out_shape_packed.dims;
+        specializations[0 + 6].i = 0; // out_shape_packed.w;
+        specializations[0 + 7].i = 0; // out_shape_packed.h;
+        specializations[0 + 8].i = 0; // out_shape_packed.c;
+        specializations[0 + 9].i = 0; // out_shape_packed.cstep;
 
         Tensor local_size_xyz;
         if (out_shape_packed.dims == 2)
@@ -378,9 +384,9 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl
         constants[10].f = w / (float)outw;
         constants[11].f = h / (float)outh;
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_interp_pack8
+        const Pipeline* pipeline = elempack == 8   ? pipeline_interp_pack8
                                    : elempack == 4 ? pipeline_interp_pack4
-                                   : pipeline_interp;
+                                                   : pipeline_interp;
 
         cmd.record_pipeline(pipeline, bindings, constants, top_blob);
     }
@@ -451,9 +457,9 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl
         constants[8].i = top_blob.c;
         constants[9].i = top_blob.cstep;
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_interp_bicubic_pack8
+        const Pipeline* pipeline = elempack == 8   ? pipeline_interp_bicubic_pack8
                                    : elempack == 4 ? pipeline_interp_bicubic_pack4
-                                   : pipeline_interp_bicubic;
+                                                   : pipeline_interp_bicubic;
 
         cmd.record_pipeline(pipeline, bindings, constants, top_blob);
     }
@@ -461,4 +467,4 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl
     return 0;
 }
 
-}   // TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/interp_vulkan.hpp b/source/device/vulkan/layer/interp_vulkan.hpp
index ef3886f45..98574f499 100644
--- a/source/device/vulkan/layer/interp_vulkan.hpp
+++ b/source/device/vulkan/layer/interp_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "interp_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Interp_vulkan : public Layer
 {
@@ -56,7 +56,7 @@ class Interp_vulkan : public Layer
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
     // virtual int upload_model(VkTransfer& cmd, const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -78,15 +78,13 @@ class Interp_vulkan : public Layer
     int output_h;
     int output_w;
 
-    int resize_type;    //1=nearest  2=bilinear  3=bicubic
+    int resize_type; //1=nearest  2=bilinear  3=bicubic
     int output_height;
     int output_width;
     float height_scale;
     float width_scale;
-
-
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/packing_vulkan.cpp b/source/device/vulkan/layer/packing_vulkan.cpp
index 86a6c9538..88a6de812 100644
--- a/source/device/vulkan/layer/packing_vulkan.cpp
+++ b/source/device/vulkan/layer/packing_vulkan.cpp
@@ -60,8 +60,6 @@ Packing_vulkan::Packing_vulkan()
 
 int Packing_vulkan::create_pipeline(const Option& _opt)
 {
-    
-
     Option opt = _opt;
     // const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
     // const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -90,7 +88,6 @@ int Packing_vulkan::create_pipeline(const Option& _opt)
     // if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
     // if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack);
     // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
-    
 
     // check blob shape
     // if (!vkdev->shape_support_image_storage(out_shape_packed))
@@ -102,7 +99,7 @@ int Packing_vulkan::create_pipeline(const Option& _opt)
     std::vector<vk_specialization_type> specializations(2 + 10);
     specializations[0].i = storage_type_from;
     specializations[1].i = storage_type_to;
-    specializations[2 + 0].i = 0;// FIXME shape elempack may be dynamic
+    specializations[2 + 0].i = 0; // FIXME shape elempack may be dynamic
     specializations[2 + 1].i = 0;
     specializations[2 + 2].i = 0;
     specializations[2 + 3].i = 0;
@@ -112,11 +109,10 @@ int Packing_vulkan::create_pipeline(const Option& _opt)
     specializations[2 + 7].i = 0; //out_shape_packed_h;
     specializations[2 + 8].i = 0; //out_shape_packed_c;
     specializations[2 + 9].i = 0; //out_shape_packed_cstep;
-    
 
     // printf("out shape dims:%d ---------------------------------\n", out_shape_packed_dims);
 
-    VkTensor local_size_xyz;// TODO more precise group size guessed from out_shape_packed
+    VkTensor local_size_xyz; // TODO more precise group size guessed from out_shape_packed
     if (out_shape_packed_dims == 1)
     {
         local_size_xyz.w = 64;
@@ -487,7 +483,6 @@ int Packing_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
         cmd.record_pipeline(pipeline_packing_pack8to1, buffer_bindings, image_bindings, constants, bottom_blob);
     }
 
-
     // printf("run packing vulkan record pipeline\n");
     return 0;
 }
diff --git a/source/device/vulkan/layer/packing_vulkan.hpp b/source/device/vulkan/layer/packing_vulkan.hpp
index 10b748020..f528edf11 100644
--- a/source/device/vulkan/layer/packing_vulkan.hpp
+++ b/source/device/vulkan/layer/packing_vulkan.hpp
@@ -52,7 +52,7 @@ class Packing_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -92,5 +92,4 @@ class Packing_vulkan : public Layer
 
 } // namespace TEngine
 
-
 #endif
diff --git a/source/device/vulkan/layer/padding_vulkan.cpp b/source/device/vulkan/layer/padding_vulkan.cpp
index 756fb05c9..27fa57853 100644
--- a/source/device/vulkan/layer/padding_vulkan.cpp
+++ b/source/device/vulkan/layer/padding_vulkan.cpp
@@ -50,32 +50,31 @@ Padding_vulkan::Padding_vulkan()
     pipeline_padding_pack8 = 0;
 }
 
-
-
 int Padding_vulkan::create_pipeline(const Option& opt)
 {
     int elempack = 1;
-    elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 : 1;
+    elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4
+                                                                               : 1;
     int out_elempack;
-    out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1;
+    out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4
+                                                                                     : 1;
 
     // printf("create padding pipeline elempack:%d %d \n", elempack, out_elempack);
 
-
     std::vector<vk_specialization_type> specializations(3 + 10);
     specializations[0].i = type;
     specializations[1].f = value;
-    specializations[2].i = 0;   // per_channel_pad_data_size ? 1 : 0;
-    specializations[3 + 0].i = 3;   // shape_packed.dims;                                                                                       
-    specializations[3 + 1].i = input_w;   // shape_packed.w;
-    specializations[3 + 2].i = input_h;   // shape_packed.h;
-    specializations[3 + 3].i = input_c;   // shape_packed.c;
+    specializations[2].i = 0;                       // per_channel_pad_data_size ? 1 : 0;
+    specializations[3 + 0].i = 3;                   // shape_packed.dims;
+    specializations[3 + 1].i = input_w;             // shape_packed.w;
+    specializations[3 + 2].i = input_h;             // shape_packed.h;
+    specializations[3 + 3].i = input_c;             // shape_packed.c;
     specializations[3 + 4].i = input_w * input_h;   // shape_packed.cstep;
-    specializations[3 + 5].i = 3;   // out_shape_packed.dims;
-    specializations[3 + 6].i = output_w;   // out_shape_packed.w;
-    specializations[3 + 7].i = output_h;   // out_shape_packed.h;
-    specializations[3 + 8].i = output_c;   // out_shape_packed.c;
-    specializations[3 + 9].i = output_w * output_h;   // out_shape_packed.cstep;
+    specializations[3 + 5].i = 3;                   // out_shape_packed.dims;
+    specializations[3 + 6].i = output_w;            // out_shape_packed.w;
+    specializations[3 + 7].i = output_h;            // out_shape_packed.h;
+    specializations[3 + 8].i = output_c;            // out_shape_packed.c;
+    specializations[3 + 9].i = output_w * output_h; // out_shape_packed.cstep;
 
     VkTensor local_size_xyz;
     // if (out_shape_packed.dims != 0)
@@ -87,7 +86,7 @@ int Padding_vulkan::create_pipeline(const Option& opt)
 
     // pack1
     // if (shape.dims == 0 || elempack == 1)
-    if(elempack == 1)
+    if (elempack == 1)
     {
         pipeline_padding = new Pipeline(vkdev);
         pipeline_padding->set_optimal_local_size_xyz(local_size_xyz);
@@ -96,7 +95,7 @@ int Padding_vulkan::create_pipeline(const Option& opt)
 
     // pack4
     // if (shape.dims == 0 || elempack == 4)
-    if(elempack == 4)
+    if (elempack == 4)
     {
         pipeline_padding_pack4 = new Pipeline(vkdev);
         pipeline_padding_pack4->set_optimal_local_size_xyz(local_size_xyz);
@@ -111,7 +110,7 @@ int Padding_vulkan::create_pipeline(const Option& opt)
         pipeline_padding_pack8->set_optimal_local_size_xyz(local_size_xyz);
         pipeline_padding_pack8->create(LayerShaderType::padding_pack8, opt, specializations);
     }
-    
+
     return 0;
 }
 
@@ -120,7 +119,6 @@ int Padding_vulkan::destroy_pipeline(const Option& /*opt*/)
     return 0;
 }
 
-
 int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const
 {
     if (top == 0 && bottom == 0 && left == 0 && right == 0)
@@ -160,11 +158,11 @@ int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
     constants[9].i = top_blob.cstep;
     constants[10].i = left;
     constants[11].i = top;
-    
+
     // printf("padding shape:%d %d %d %d %d %d %d %d %d\n", top_blob.c, top_blob.h, top_blob.w, top_blob.cstep, bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.cstep, elempack);
-    const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8
-                             : elempack == 4 ? pipeline_padding_pack4
-                             : pipeline_padding;
+    const Pipeline* pipeline = elempack == 8   ? pipeline_padding_pack8
+                               : elempack == 4 ? pipeline_padding_pack4
+                                               : pipeline_padding;
 
     cmd.record_pipeline(pipeline, bindings, constants, top_blob);
 
diff --git a/source/device/vulkan/layer/padding_vulkan.hpp b/source/device/vulkan/layer/padding_vulkan.hpp
index f6aabe066..03bbce43d 100644
--- a/source/device/vulkan/layer/padding_vulkan.hpp
+++ b/source/device/vulkan/layer/padding_vulkan.hpp
@@ -52,7 +52,7 @@ class Padding_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -60,7 +60,7 @@ class Padding_vulkan : public Layer
     int bottom;
     int left;
     int right;
-    int type;// 0=CONSTANT 1=REPLICATE 2=REFLECT
+    int type; // 0=CONSTANT 1=REPLICATE 2=REFLECT
     float value;
     int input_w;
     int input_h;
@@ -77,5 +77,4 @@ class Padding_vulkan : public Layer
 
 } // namespace TEngine
 
-
 #endif
diff --git a/source/device/vulkan/layer/permute_vulkan.cpp b/source/device/vulkan/layer/permute_vulkan.cpp
index 461b3cc25..0bead6791 100644
--- a/source/device/vulkan/layer/permute_vulkan.cpp
+++ b/source/device/vulkan/layer/permute_vulkan.cpp
@@ -76,27 +76,27 @@ Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // params
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
 
     // TODO fix order_type value
-    struct permute_param *param = (struct permute_param *)ir_node->op.param_mem;
+    struct permute_param* param = (struct permute_param*)ir_node->op.param_mem;
     if ((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1))
     {
-        order_type = 3; 
+        order_type = 3;
     }
     else if ((param->order0 == 1) && (param->order1 == 0) && (param->order2 == 2) && input->dim_num == 3)
     {
@@ -106,24 +106,29 @@ Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     {
         order_type = 0;
     }
-    
 }
 
 int Permute_vulkan::create_pipeline(const Option& _opt)
 {
     Option opt = _opt;
-    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
+    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0);        // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
     const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     int out_elempack = 1;
-    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
-    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;
+    if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4
+                                                                                                                    : 1;
+    if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4
+                                                                                                                    : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -162,16 +167,16 @@ int Permute_vulkan::create_pipeline(const Option& _opt)
 
     std::vector<vk_specialization_type> specializations(1 + 10);
     specializations[0].i = order_type;
-    specializations[1 + 0].i = 0;   // shape_packed.dims;
-    specializations[1 + 1].i = 0;   // shape_packed.w;
-    specializations[1 + 2].i = 0;   // shape_packed.h;
-    specializations[1 + 3].i = 0;   // shape_packed.c;
-    specializations[1 + 4].i = 0;   // shape_packed.cstep;
-    specializations[1 + 5].i = 0;   // out_shape_packed.dims;
-    specializations[1 + 6].i = 0;   // out_shape_packed.w;
-    specializations[1 + 7].i = 0;   // out_shape_packed.h;
-    specializations[1 + 8].i = 0;   // out_shape_packed.c;
-    specializations[1 + 9].i = 0;   // out_shape_packed.cstep;
+    specializations[1 + 0].i = 0; // shape_packed.dims;
+    specializations[1 + 1].i = 0; // shape_packed.w;
+    specializations[1 + 2].i = 0; // shape_packed.h;
+    specializations[1 + 3].i = 0; // shape_packed.c;
+    specializations[1 + 4].i = 0; // shape_packed.cstep;
+    specializations[1 + 5].i = 0; // out_shape_packed.dims;
+    specializations[1 + 6].i = 0; // out_shape_packed.w;
+    specializations[1 + 7].i = 0; // out_shape_packed.h;
+    specializations[1 + 8].i = 0; // out_shape_packed.c;
+    specializations[1 + 9].i = 0; // out_shape_packed.cstep;
 
     Tensor local_size_xyz_bottom; // pack4to1 and pack8to1
     if (shape_packed.dims == 2)
@@ -342,7 +347,8 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
             outh = w;
         }
 
-        out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
+        out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4
+                                                                                 : 1;
         out_elemsize = elemsize / elempack * out_elempack;
 
         if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -401,7 +407,8 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
             outc = w;
         }
 
-        out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
+        out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4
+                                                                                 : 1;
         out_elemsize = elemsize / elempack * out_elempack;
 
         if (opt.use_fp16_packed && !opt.use_fp16_storage)
@@ -472,4 +479,4 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
     return 0;
 }
 
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/permute_vulkan.hpp b/source/device/vulkan/layer/permute_vulkan.hpp
index 5ea17c635..2a6763c13 100644
--- a/source/device/vulkan/layer/permute_vulkan.hpp
+++ b/source/device/vulkan/layer/permute_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "permute_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Permute_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class Permute_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -79,6 +79,6 @@ class Permute_vulkan : public Layer
     int order_type;
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/pooling_vulkan.cpp b/source/device/vulkan/layer/pooling_vulkan.cpp
index eb50b1704..8f4234367 100644
--- a/source/device/vulkan/layer/pooling_vulkan.cpp
+++ b/source/device/vulkan/layer/pooling_vulkan.cpp
@@ -51,7 +51,6 @@ Pooling_vulkan::Pooling_vulkan()
     pipeline_pooling_global = 0;
     pipeline_pooling_global_pack4 = 0;
     pipeline_pooling_global_pack8 = 0;
-
 }
 
 Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
@@ -67,28 +66,28 @@ Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
     // Tensor* output_tensor = t_node->GetOutputTensor(0);
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
-    struct pool_param *param_ = (struct pool_param *)ir_node->op.param_mem;
+    struct pool_param* param_ = (struct pool_param*)ir_node->op.param_mem;
 
-    pooling_type = param_->pool_method;     // 0:max    1:avg
+    pooling_type = param_->pool_method; // 0:max    1:avg
     kernel_h = param_->kernel_h;
     kernel_w = param_->kernel_w;
     stride_h = param_->stride_h;
     stride_w = param_->stride_w;
     global = param_->global;
     caffe_flavor = param_->caffe_flavor;
-    pad_h0 = param_->pad_h0;  
-    pad_w0 = param_->pad_w0;  
-    pad_h1 = param_->pad_h1;  
-    pad_w1 = param_->pad_w1;  
+    pad_h0 = param_->pad_h0;
+    pad_w0 = param_->pad_w0;
+    pad_h1 = param_->pad_h1;
+    pad_w1 = param_->pad_w1;
     input_c = input->dims[1];
     input_h = input->dims[2];
     input_w = input->dims[3];
@@ -98,11 +97,12 @@ Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     // printf("create pooling layer with param:%d %d %d %d %d %d %d %d %d %d\n", kernel_h, kernel_w, stride_h, stride_w, global, pad_h0, pad_h1, pad_w0, pad_w1, param_->alg);
 }
 
-
 int Pooling_vulkan::create_pipeline(const Option& opt)
 {
-    int elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 : 1;
-    int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1;
+    int elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4
+                                                                                   : 1;
+    int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4
+                                                                                         : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -121,7 +121,7 @@ int Pooling_vulkan::create_pipeline(const Option& opt)
         elemsize = elempack * 4u;
         out_elemsize = out_elempack * 4u;
     }
-    
+
     {
         padding = new Padding_vulkan();
         padding->vkdev = vkdev;
@@ -143,7 +143,7 @@ int Pooling_vulkan::create_pipeline(const Option& opt)
         padding->create_pipeline(opt);
     }
 
-    if(global)
+    if (global)
     {
         std::vector<vk_specialization_type> specializations(1 + 10);
         specializations[0].i = pooling_type;
@@ -203,18 +203,18 @@ int Pooling_vulkan::create_pipeline(const Option& opt)
         specializations[7].i = pad_h0;
         specializations[8].i = pad_h1;
         specializations[9].i = global;
-        specializations[10].i = 0; // pad_mode;
-        specializations[11].i = 0; // avgpool_count_include_pad;
-        specializations[12 + 0].i = 0;  // 3; // shape_bordered_packed.dims;
-        specializations[12 + 1].i = 0;  // input_w; // shape_bordered_packed.w;
-        specializations[12 + 2].i = 0;  // input_h; // shape_bordered_packed.h;
-        specializations[12 + 3].i = 0;  // input_c; // shape_bordered_packed.c;
-        specializations[12 + 4].i = 0;  // input_w * input_h; // shape_bordered_packed.cstep;
-        specializations[12 + 5].i = 0;  // 3; // out_shape_packed.dims;
-        specializations[12 + 6].i = 0;  // output_w; // out_shape_packed.w;
-        specializations[12 + 7].i = 0;  // output_h; // out_shape_packed.h;
-        specializations[12 + 8].i = 0;  // output_c; // out_shape_packed.c;
-        specializations[12 + 9].i = 0;  // output_h * output_c; // out_shape_packed.cstep;
+        specializations[10].i = 0;     // pad_mode;
+        specializations[11].i = 0;     // avgpool_count_include_pad;
+        specializations[12 + 0].i = 0; // 3; // shape_bordered_packed.dims;
+        specializations[12 + 1].i = 0; // input_w; // shape_bordered_packed.w;
+        specializations[12 + 2].i = 0; // input_h; // shape_bordered_packed.h;
+        specializations[12 + 3].i = 0; // input_c; // shape_bordered_packed.c;
+        specializations[12 + 4].i = 0; // input_w * input_h; // shape_bordered_packed.cstep;
+        specializations[12 + 5].i = 0; // 3; // out_shape_packed.dims;
+        specializations[12 + 6].i = 0; // output_w; // out_shape_packed.w;
+        specializations[12 + 7].i = 0; // output_h; // out_shape_packed.h;
+        specializations[12 + 8].i = 0; // output_c; // out_shape_packed.c;
+        specializations[12 + 9].i = 0; // output_h * output_c; // out_shape_packed.cstep;
 
         VkTensor local_size_xyz;
         local_size_xyz.w = std::min(4, output_w);
@@ -262,10 +262,10 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
     size_t elemsize = bottom_blob.elemsize;
     int elempack = bottom_blob.elempack;
 
-    if(global)
+    if (global)
     {
         // printf("input shape: %d %d %d, out shape: %d %d %d\n", input_c, input_h, input_w, output_c, output_h, output_w);
-        top_blob.create(output_c/elempack, elemsize, elempack, opt.blob_vkallocator);
+        top_blob.create(output_c / elempack, elemsize, elempack, opt.blob_vkallocator);
         if (top_blob.empty())
             return -100;
         // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w);
@@ -285,9 +285,9 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
         constants[8].i = top_blob.c;
         constants[9].i = top_blob.cstep;
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8
+        const Pipeline* pipeline = elempack == 8   ? pipeline_pooling_global_pack8
                                    : elempack == 4 ? pipeline_pooling_global_pack4
-                                   : pipeline_pooling_global;
+                                                   : pipeline_pooling_global;
 
         cmd.record_pipeline(pipeline, bindings, constants, top_blob);
 
@@ -306,8 +306,7 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
         padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
     }
 
-    top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator);
-
+    top_blob.create(output_w, output_h, output_c / elempack, elemsize, elempack, opt.blob_vkallocator);
 
     std::vector<VkTensor> bindings(2);
     bindings[0] = bottom_blob_bordered;
@@ -327,9 +326,9 @@ int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
     constants[10].i = 0;
     constants[11].i = 0;
 
-    const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8
+    const Pipeline* pipeline = elempack == 8   ? pipeline_pooling_pack8
                                : elempack == 4 ? pipeline_pooling_pack4
-                               : pipeline_pooling;
+                                               : pipeline_pooling;
 
     cmd.record_pipeline(pipeline, bindings, constants, top_blob);
     return 0;
diff --git a/source/device/vulkan/layer/pooling_vulkan.hpp b/source/device/vulkan/layer/pooling_vulkan.hpp
index e4a823e9e..33be747b2 100644
--- a/source/device/vulkan/layer/pooling_vulkan.hpp
+++ b/source/device/vulkan/layer/pooling_vulkan.hpp
@@ -56,21 +56,21 @@ class Pooling_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
-    int pooling_type;   // // 0:max    1:avg
-    int kernel_h; // = param_->kernel_h;
-    int kernel_w; // = param_->kernel_w;
-    int stride_h; // = param_->stride_h;
-    int stride_w; // = param_->stride_w;
-    int global; // = param_->global;
+    int pooling_type; // // 0:max    1:avg
+    int kernel_h;     // = param_->kernel_h;
+    int kernel_w;     // = param_->kernel_w;
+    int stride_h;     // = param_->stride_h;
+    int stride_w;     // = param_->stride_w;
+    int global;       // = param_->global;
     int caffe_flavor; // = param_->caffe_flavor;
-    int pad_h0; // = param_->pad_h0;  
-    int pad_w0; // = param_->pad_w0;  
-    int pad_h1; // = param_->pad_h1;  
-    int pad_w1; // = param_->pad_w1;  
+    int pad_h0;       // = param_->pad_h0;
+    int pad_w0;       // = param_->pad_w0;
+    int pad_h1;       // = param_->pad_h1;
+    int pad_w1;       // = param_->pad_w1;
     int input_c;
     int input_h;
     int input_w;
@@ -91,5 +91,4 @@ class Pooling_vulkan : public Layer
 
 } // namespace TEngine
 
-
 #endif
diff --git a/source/device/vulkan/layer/priorbox_vulkan.cpp b/source/device/vulkan/layer/priorbox_vulkan.cpp
index de81aec7a..23198f4e8 100644
--- a/source/device/vulkan/layer/priorbox_vulkan.cpp
+++ b/source/device/vulkan/layer/priorbox_vulkan.cpp
@@ -60,28 +60,28 @@ PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    for(int i = 0; i < ir_node->input_num; i++)
+    for (int i = 0; i < ir_node->input_num; i++)
     {
-        struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = input->name;
         bottoms.push_back(name);
     }
 
-    for(int i = 0; i < ir_node->output_num; i++)
+    for (int i = 0; i < ir_node->output_num; i++)
     {
-        struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]);
+        struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]);
         std::string name = output->name;
         tops.push_back(name);
     }
 
     // params
-    struct tensor *featmap_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
-    struct tensor *data_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
-    struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    input_c = data_tensor->dims[1];   // param->input_channel;
+    struct tensor* featmap_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* data_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
+    struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    input_c = data_tensor->dims[1]; // param->input_channel;
     input_h = data_tensor->dims[2];
     input_w = data_tensor->dims[3];
-    output_c = output_tensor->dims[1];  // param->output_channel;
+    output_c = output_tensor->dims[1]; // param->output_channel;
     output_h = output_tensor->dims[2];
     output_w = output_tensor->dims[3];
 
@@ -90,8 +90,8 @@ PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     const int feat_height = featmap_tensor->dims[2];
     const int feat_width = featmap_tensor->dims[3];
 
-    struct priorbox_param *param = (struct priorbox_param *)ir_node->op.param_mem;
-    
+    struct priorbox_param* param = (struct priorbox_param*)ir_node->op.param_mem;
+
     variances[0] = (param->variance)[0];
     variances[1] = (param->variance)[1];
     variances[2] = (param->variance)[2];
@@ -112,8 +112,8 @@ PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
 
     if (param->step_h == 0 || param->step_w == 0)
     {
-        step_width = ( float )(image_width) / feat_width;
-        step_height = ( float )(image_height) / feat_height;
+        step_width = (float)(image_width) / feat_width;
+        step_height = (float)(image_height) / feat_height;
     }
     else
     {
@@ -137,9 +137,12 @@ int PriorBox_vulkan::create_pipeline(const Option& opt)
     const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     size_t elemsize;
     if (opt.use_fp16_storage)
@@ -182,8 +185,8 @@ int PriorBox_vulkan::create_pipeline(const Option& opt)
         specializations[8].i = num_max_size;
         specializations[9].i = num_aspect_ratio;
         specializations[10].i = num_prior;
-        specializations[11 + 0].i = 0;//shape_packed.w;
-        specializations[11 + 1].i = 0;//shape_packed.h;
+        specializations[11 + 0].i = 0; //shape_packed.w;
+        specializations[11 + 1].i = 0; //shape_packed.h;
 
         pipeline_priorbox = new Pipeline(vkdev);
         pipeline_priorbox->set_optimal_local_size_xyz();
@@ -348,4 +351,4 @@ int PriorBox_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs,
     return 0;
 }
 
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/priorbox_vulkan.hpp b/source/device/vulkan/layer/priorbox_vulkan.hpp
index 69b8f8bb7..3ae12f99e 100644
--- a/source/device/vulkan/layer/priorbox_vulkan.hpp
+++ b/source/device/vulkan/layer/priorbox_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "priorbox_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class PriorBox_vulkan : public Layer
 {
@@ -56,7 +56,7 @@ class PriorBox_vulkan : public Layer
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
     virtual int upload_model(VkTransfer& cmd, const Option& opt);
-    
+
     virtual int record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -91,6 +91,6 @@ class PriorBox_vulkan : public Layer
     VkTensor aspect_ratios_gpu;
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/relu_vulkan.cpp b/source/device/vulkan/layer/relu_vulkan.cpp
index f541806cf..510d4245b 100644
--- a/source/device/vulkan/layer/relu_vulkan.cpp
+++ b/source/device/vulkan/layer/relu_vulkan.cpp
@@ -64,23 +64,23 @@ ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // params
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
 
-    struct relu_param *param = (struct relu_param *)ir_node->op.param_mem;
+    struct relu_param* param = (struct relu_param*)ir_node->op.param_mem;
     negative_slope = param->negative_slope;
 }
 
@@ -89,9 +89,12 @@ int ReLU_vulkan::create_pipeline(const Option& opt)
     const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     size_t elemsize;
     if (opt.use_fp16_storage)
@@ -113,12 +116,12 @@ int ReLU_vulkan::create_pipeline(const Option& opt)
     if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
 
     std::vector<vk_specialization_type> specializations(1 + 5);
-    specializations[0].f = negative_slope;  // slope;
-    specializations[1 + 0].i = 0;   // shape_packed.dims;
-    specializations[1 + 1].i = 0;   // shape_packed.w;
-    specializations[1 + 2].i = 0;   // shape_packed.h;
-    specializations[1 + 3].i = 0;   // shape_packed.c;
-    specializations[1 + 4].i = 0;   // shape_packed.cstep;
+    specializations[0].f = negative_slope; // slope;
+    specializations[1 + 0].i = 0;          // shape_packed.dims;
+    specializations[1 + 1].i = 0;          // shape_packed.w;
+    specializations[1 + 2].i = 0;          // shape_packed.h;
+    specializations[1 + 3].i = 0;          // shape_packed.c;
+    specializations[1 + 4].i = 0;          // shape_packed.cstep;
 
     Tensor local_size_xyz;
     if (shape_packed.dims == 1)
@@ -167,7 +170,6 @@ int ReLU_vulkan::create_pipeline(const Option& opt)
     return 0;
 }
 
-
 int ReLU_vulkan::destroy_pipeline(const Option& /*opt*/)
 {
     delete pipeline_relu;
@@ -196,9 +198,9 @@ int ReLU_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, cons
     constants[3].i = bottom_top_blob.c;
     constants[4].i = bottom_top_blob.cstep;
 
-    const Pipeline* pipeline = elempack == 8 ? pipeline_relu_pack8
+    const Pipeline* pipeline = elempack == 8   ? pipeline_relu_pack8
                                : elempack == 4 ? pipeline_relu_pack4
-                               : pipeline_relu;
+                                               : pipeline_relu;
 
     cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
 
@@ -211,4 +213,4 @@ int ReLU_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob
     return 0;
 }
 
-}
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/relu_vulkan.hpp b/source/device/vulkan/layer/relu_vulkan.hpp
index c928a756f..c707481c8 100644
--- a/source/device/vulkan/layer/relu_vulkan.hpp
+++ b/source/device/vulkan/layer/relu_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "relu_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class ReLU_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class ReLU_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
     virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
 
@@ -74,6 +74,6 @@ class ReLU_vulkan : public Layer
     float negative_slope;
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/reshape_vulkan.cpp b/source/device/vulkan/layer/reshape_vulkan.cpp
index 7e36dca8f..3f12e241f 100644
--- a/source/device/vulkan/layer/reshape_vulkan.cpp
+++ b/source/device/vulkan/layer/reshape_vulkan.cpp
@@ -86,59 +86,56 @@ Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // params
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
 
-    struct reshape_param *param = (struct reshape_param *)ir_node->op.param_mem;
+    struct reshape_param* param = (struct reshape_param*)ir_node->op.param_mem;
 
     ndim = param->dim_size;
     permute = param->reverse;
-    // TODO fix 
+    // TODO fix
     // c = param->re_shape[0];
     // w = param->re_shape[1];
     // h = param->re_shape[2];
-    if(param->dim_size == 4)
+    if (param->dim_size == 4)
     {
         ndim = 3;
-        output_c = output->dims[1];  // param->output_channel;
+        output_c = output->dims[1]; // param->output_channel;
         output_h = output->dims[2];
         output_w = output->dims[3];
 
-        c = output->dims[1];  // param->output_channel;
+        c = output->dims[1]; // param->output_channel;
         h = output->dims[2];
         w = output->dims[3];
     }
     else
     {
         ndim = param->dim_size;
-        
-        output_c = output->dims[0];  // param->output_channel;
+
+        output_c = output->dims[0]; // param->output_channel;
         output_h = output->dims[1];
         output_w = output->dims[2];
 
-        c = output_c;  // param->output_channel;
+        c = output_c; // param->output_channel;
         h = output_h;
         w = output_w;
     }
-
-    
-
 }
 
 int Reshape_vulkan::create_pipeline(const Option& _opt)
 {
     Option opt = _opt;
-    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
+    const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0);        // bottom_shapes.empty() ? Tensor() : bottom_shapes[0];
     const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     bool need_permute = permute == 1;
@@ -161,14 +158,20 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
     }
 
     int elempack = 1;
-    if (shape_permuted.dims == 1) elempack = opt.use_shader_pack8 && shape_permuted.w % 8 == 0 ? 8 : shape_permuted.w % 4 == 0 ? 4 : 1;
-    if (shape_permuted.dims == 2) elempack = opt.use_shader_pack8 && shape_permuted.h % 8 == 0 ? 8 : shape_permuted.h % 4 == 0 ? 4 : 1;
-    if (shape_permuted.dims == 3) elempack = opt.use_shader_pack8 && shape_permuted.c % 8 == 0 ? 8 : shape_permuted.c % 4 == 0 ? 4 : 1;
+    if (shape_permuted.dims == 1) elempack = opt.use_shader_pack8 && shape_permuted.w % 8 == 0 ? 8 : shape_permuted.w % 4 == 0 ? 4
+                                                                                                                               : 1;
+    if (shape_permuted.dims == 2) elempack = opt.use_shader_pack8 && shape_permuted.h % 8 == 0 ? 8 : shape_permuted.h % 4 == 0 ? 4
+                                                                                                                               : 1;
+    if (shape_permuted.dims == 3) elempack = opt.use_shader_pack8 && shape_permuted.c % 8 == 0 ? 8 : shape_permuted.c % 4 == 0 ? 4
+                                                                                                                               : 1;
 
     int out_elempack = 1;
-    if (out_shape_permuted.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape_permuted.w % 8 == 0 ? 8 : out_shape_permuted.w % 4 == 0 ? 4 : 1;
-    if (out_shape_permuted.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape_permuted.h % 8 == 0 ? 8 : out_shape_permuted.h % 4 == 0 ? 4 : 1;
-    if (out_shape_permuted.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape_permuted.c % 8 == 0 ? 8 : out_shape_permuted.c % 4 == 0 ? 4 : 1;
+    if (out_shape_permuted.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape_permuted.w % 8 == 0 ? 8 : out_shape_permuted.w % 4 == 0 ? 4
+                                                                                                                                               : 1;
+    if (out_shape_permuted.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape_permuted.h % 8 == 0 ? 8 : out_shape_permuted.h % 4 == 0 ? 4
+                                                                                                                                               : 1;
+    if (out_shape_permuted.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape_permuted.c % 8 == 0 ? 8 : out_shape_permuted.c % 4 == 0 ? 4
+                                                                                                                                               : 1;
 
     size_t elemsize;
     size_t out_elemsize;
@@ -204,19 +207,19 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
         support_image_storage = false;
         opt.use_image_storage = false;
     }
-    
+
     std::vector<vk_specialization_type> specializations(1 + 10);
     specializations[0].i = ndim;
-    specializations[1 + 0].i = 0;   // shape_packed.dims;
-    specializations[1 + 1].i = 0;   // shape_packed.w;
-    specializations[1 + 2].i = 0;   // shape_packed.h;
-    specializations[1 + 3].i = 0;   // shape_packed.c;
-    specializations[1 + 4].i = 0;   // shape_packed.cstep;
-    specializations[1 + 5].i = 0;   // out_shape_packed.dims;
-    specializations[1 + 6].i = 0;   // out_shape_packed.w;
-    specializations[1 + 7].i = 0;   // out_shape_packed.h;
-    specializations[1 + 8].i = 0;   // out_shape_packed.c;
-    specializations[1 + 9].i = 0;   // out_shape_packed.cstep;
+    specializations[1 + 0].i = 0; // shape_packed.dims;
+    specializations[1 + 1].i = 0; // shape_packed.w;
+    specializations[1 + 2].i = 0; // shape_packed.h;
+    specializations[1 + 3].i = 0; // shape_packed.c;
+    specializations[1 + 4].i = 0; // shape_packed.cstep;
+    specializations[1 + 5].i = 0; // out_shape_packed.dims;
+    specializations[1 + 6].i = 0; // out_shape_packed.w;
+    specializations[1 + 7].i = 0; // out_shape_packed.h;
+    specializations[1 + 8].i = 0; // out_shape_packed.c;
+    specializations[1 + 9].i = 0; // out_shape_packed.cstep;
 
     Tensor local_size_xyz_bottom; // pack4to1 and pack8to1
     if (shape_packed.dims == 1)
@@ -415,7 +418,8 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
         if (outw == -1)
             outw = total;
 
-        out_elempack = opt.use_shader_pack8 && outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1;
+        out_elempack = opt.use_shader_pack8 && outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4
+                                                                                 : 1;
 
         if (dims == 1 && bottom_blob.w == outw && elempack == out_elempack)
         {
@@ -435,7 +439,8 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
         if (outh == -1)
             outh = total / outw;
 
-        out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
+        out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4
+                                                                                 : 1;
 
         if (dims == 2 && bottom_blob.h == outh && elempack == out_elempack)
         {
@@ -460,7 +465,8 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
         if (outc == -1)
             outc = total / outh / outw;
 
-        out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
+        out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4
+                                                                                 : 1;
 
         if (dims == 3 && bottom_blob.c == outc && elempack == out_elempack)
         {
@@ -576,5 +582,4 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b
     return 0;
 }
 
-
-}   // namespace TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/layer/reshape_vulkan.hpp b/source/device/vulkan/layer/reshape_vulkan.hpp
index 33bc2be41..1d52e48a8 100644
--- a/source/device/vulkan/layer/reshape_vulkan.hpp
+++ b/source/device/vulkan/layer/reshape_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "reshape_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Reshape_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class Reshape_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -90,9 +90,8 @@ class Reshape_vulkan : public Layer
     int permute;
 
     int ndim;
-
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer/softmax_vulkan.cpp b/source/device/vulkan/layer/softmax_vulkan.cpp
index 970e03295..8ee653505 100644
--- a/source/device/vulkan/layer/softmax_vulkan.cpp
+++ b/source/device/vulkan/layer/softmax_vulkan.cpp
@@ -86,24 +86,24 @@ Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node)
     graph = ir_graph;
     node = ir_node;
 
-    struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     std::string name = input->name;
     bottoms.push_back(name);
 
-    struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]);
+    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     name = output->name;
     tops.push_back(name);
 
     // params
-    input_c = input->dims[1];   // param->input_channel;
+    input_c = input->dims[1]; // param->input_channel;
     input_h = input->dims[2];
     input_w = input->dims[3];
-    output_c = output->dims[1];  // param->output_channel;
+    output_c = output->dims[1]; // param->output_channel;
     output_h = output->dims[2];
     output_w = output->dims[3];
-    
-    struct softmax_param *param = (struct softmax_param *)ir_node->op.param_mem;
-    axis = param->axis-1;
+
+    struct softmax_param* param = (struct softmax_param*)ir_node->op.param_mem;
+    axis = param->axis - 1;
 }
 
 int Softmax_vulkan::create_pipeline(const Option& opt)
@@ -111,9 +111,12 @@ int Softmax_vulkan::create_pipeline(const Option& opt)
     const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
 
     int elempack = 1;
-    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
-    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
-    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+    if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4
+                                                                                                    : 1;
+    if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4
+                                                                                                    : 1;
 
     size_t elemsize;
     if (opt.use_fp16_storage)
@@ -162,16 +165,16 @@ int Softmax_vulkan::create_pipeline(const Option& opt)
 
     std::vector<vk_specialization_type> specializations(1 + 10);
     specializations[0].i = axis;
-    specializations[1 + 0].i = 0;   // shape_packed.dims;
-    specializations[1 + 1].i = 0;   // shape_packed.w;
-    specializations[1 + 2].i = 0;   // shape_packed.h;
-    specializations[1 + 3].i = 0;   // shape_packed.c;
-    specializations[1 + 4].i = 0;   // shape_packed.cstep;
-    specializations[1 + 5].i = 0;   // workspace_shape_packed.dims;
-    specializations[1 + 6].i = 0;   // workspace_shape_packed.w;
-    specializations[1 + 7].i = 0;   // workspace_shape_packed.h;
-    specializations[1 + 8].i = 0;   // workspace_shape_packed.c;
-    specializations[1 + 9].i = 0;   // workspace_shape_packed.cstep;
+    specializations[1 + 0].i = 0; // shape_packed.dims;
+    specializations[1 + 1].i = 0; // shape_packed.w;
+    specializations[1 + 2].i = 0; // shape_packed.h;
+    specializations[1 + 3].i = 0; // shape_packed.c;
+    specializations[1 + 4].i = 0; // shape_packed.cstep;
+    specializations[1 + 5].i = 0; // workspace_shape_packed.dims;
+    specializations[1 + 6].i = 0; // workspace_shape_packed.w;
+    specializations[1 + 7].i = 0; // workspace_shape_packed.h;
+    specializations[1 + 8].i = 0; // workspace_shape_packed.c;
+    specializations[1 + 9].i = 0; // workspace_shape_packed.cstep;
 
     {
         Tensor local_size_xyz;
@@ -294,7 +297,6 @@ int Softmax_vulkan::create_pipeline(const Option& opt)
     return 0;
 }
 
-
 int Softmax_vulkan::destroy_pipeline(const Option& /*opt*/)
 {
     delete pipeline_softmax_reduce_max;
@@ -397,9 +399,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c
         constants[8].i = max_workspace.c;
         constants[9].i = max_workspace.cstep;
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_max_pack8
+        const Pipeline* pipeline = elempack == 8   ? pipeline_softmax_reduce_max_pack8
                                    : elempack == 4 ? pipeline_softmax_reduce_max_pack4
-                                   : pipeline_softmax_reduce_max;
+                                                   : pipeline_softmax_reduce_max;
 
         cmd.record_pipeline(pipeline, bindings, constants, max_workspace);
     }
@@ -422,9 +424,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c
         constants[8].i = max_workspace.c;
         constants[9].i = max_workspace.cstep;
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_exp_sub_max_pack8
+        const Pipeline* pipeline = elempack == 8   ? pipeline_softmax_exp_sub_max_pack8
                                    : elempack == 4 ? pipeline_softmax_exp_sub_max_pack4
-                                   : pipeline_softmax_exp_sub_max;
+                                                   : pipeline_softmax_exp_sub_max;
 
         cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
     }
@@ -447,9 +449,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c
         constants[8].i = sum_workspace.c;
         constants[9].i = sum_workspace.cstep;
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_sum_pack8
+        const Pipeline* pipeline = elempack == 8   ? pipeline_softmax_reduce_sum_pack8
                                    : elempack == 4 ? pipeline_softmax_reduce_sum_pack4
-                                   : pipeline_softmax_reduce_sum;
+                                                   : pipeline_softmax_reduce_sum;
 
         cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
     }
@@ -472,9 +474,9 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c
         constants[8].i = sum_workspace.c;
         constants[9].i = sum_workspace.cstep;
 
-        const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_div_sum_pack8
+        const Pipeline* pipeline = elempack == 8   ? pipeline_softmax_div_sum_pack8
                                    : elempack == 4 ? pipeline_softmax_div_sum_pack4
-                                   : pipeline_softmax_div_sum;
+                                                   : pipeline_softmax_div_sum;
 
         cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
     }
@@ -482,5 +484,4 @@ int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c
     return 0;
 }
 
-
-}   // namespace TEngine
+} // namespace TEngine
diff --git a/source/device/vulkan/layer/softmax_vulkan.hpp b/source/device/vulkan/layer/softmax_vulkan.hpp
index 108ea5d62..94c1be27c 100644
--- a/source/device/vulkan/layer/softmax_vulkan.hpp
+++ b/source/device/vulkan/layer/softmax_vulkan.hpp
@@ -45,7 +45,7 @@
 
 #include "softmax_param.h"
 
-namespace TEngine{
+namespace TEngine {
 
 class Softmax_vulkan : public Layer
 {
@@ -55,7 +55,7 @@ class Softmax_vulkan : public Layer
 
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
-    
+
     virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
@@ -82,9 +82,8 @@ class Softmax_vulkan : public Layer
     int output_c;
     int output_h;
     int output_w;
-
 };
 
-}   // namespace TEngine
+} // namespace TEngine
 
 #endif
\ No newline at end of file
diff --git a/source/device/vulkan/layer_shader_type.h b/source/device/vulkan/layer_shader_type.h
index e9c713062..2fc6d359c 100644
--- a/source/device/vulkan/layer_shader_type.h
+++ b/source/device/vulkan/layer_shader_type.h
@@ -47,7 +47,7 @@ enum LayerShaderType
 {
 #include "layer_shader_type_enum.h"
 };
-} // namespace LayerType
+} // namespace LayerShaderType
 
 } // namespace TEngine
 
diff --git a/source/device/vulkan/vulkan_allocator.cpp b/source/device/vulkan/vulkan_allocator.cpp
index c5483ca4f..b901923cd 100644
--- a/source/device/vulkan/vulkan_allocator.cpp
+++ b/source/device/vulkan/vulkan_allocator.cpp
@@ -48,10 +48,10 @@ namespace TEngine {
 
 Allocator::~Allocator()
 {
-
 }
 
-VkAllocator::VkAllocator(const GPUDevice* _vkdev) : vkdev(_vkdev)
+VkAllocator::VkAllocator(const GPUDevice* _vkdev)
+    : vkdev(_vkdev)
 {
     buffer_memory_type_index = (uint32_t)-1;
     image_memory_type_index = (uint32_t)-1;
@@ -258,7 +258,8 @@ VkImageView VkAllocator::create_imageview(VkImageViewType type, VkImage image, V
     return imageview;
 }
 
-VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev)
+VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev)
+    : VkAllocator(_vkdev)
 {
     buffer_offset_alignment = vkdev->info.buffer_offset_alignment;
     bind_memory_offset_alignment = vkdev->info.buffer_image_granularity;
@@ -273,7 +274,7 @@ VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev)
         buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size);
     }
 
-    block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment);// 16M
+    block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment); // 16M
 }
 
 VkBlobAllocator::~VkBlobAllocator()
@@ -284,18 +285,18 @@ VkBlobAllocator::~VkBlobAllocator()
 // TODO
 void VkBlobAllocator::clear()
 {
-//     TLOG_INFO("VkBlobAllocator %lu", buffer_blocks.size());
+    //     TLOG_INFO("VkBlobAllocator %lu", buffer_blocks.size());
 
-    for (size_t i=0; i<buffer_blocks.size(); i++)
+    for (size_t i = 0; i < buffer_blocks.size(); i++)
     {
         VkBufferMemory* ptr = buffer_blocks[i];
 
-//         std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
-//         while (it != buffer_budgets[i].end())
-//         {
-//             TLOG_INFO("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second);
-//             it++;
-//         }
+        //         std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
+        //         while (it != buffer_budgets[i].end())
+        //         {
+        //             TLOG_INFO("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second);
+        //             it++;
+        //         }
 
         if (mappable)
             vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
@@ -309,16 +310,16 @@ void VkBlobAllocator::clear()
 
     buffer_budgets.clear();
 
-    for (size_t i=0; i<image_memory_blocks.size(); i++)
+    for (size_t i = 0; i < image_memory_blocks.size(); i++)
     {
         VkDeviceMemory memory = image_memory_blocks[i];
 
-//         std::list< std::pair<size_t, size_t> >::iterator it = image_memory_budgets[i].begin();
-//         while (it != image_memory_budgets[i].end())
-//         {
-//             TLOG_INFO("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second);
-//             it++;
-//         }
+        //         std::list< std::pair<size_t, size_t> >::iterator it = image_memory_budgets[i].begin();
+        //         while (it != image_memory_budgets[i].end())
+        //         {
+        //             TLOG_INFO("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second);
+        //             it++;
+        //         }
 
         vkFreeMemory(vkdev->vkdevice(), memory, 0);
     }
@@ -334,9 +335,9 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
     const int buffer_block_count = buffer_blocks.size();
 
     // find first spare space in buffer_blocks
-    for (int i=0; i<buffer_block_count; i++)
+    for (int i = 0; i < buffer_block_count; i++)
     {
-        std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
+        std::list<std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
         while (it != buffer_budgets[i].end())
         {
             size_t budget_size = it->second;
@@ -430,7 +431,7 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
     ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 
     // adjust buffer_budgets
-    std::list< std::pair<size_t, size_t> > budget;
+    std::list<std::pair<size_t, size_t> > budget;
     if (new_block_size > aligned_size)
     {
         budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
@@ -440,7 +441,6 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
     //     TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);
 
     return ptr;
-
 }
 
 VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack)
@@ -536,9 +536,9 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t
     const int image_memory_block_count = image_memory_blocks.size();
 
     // find first spare space in image_memory_blocks
-    for (int i=0; i<image_memory_block_count; i++)
+    for (int i = 0; i < image_memory_block_count; i++)
     {
-        std::list< std::pair<size_t, size_t> >::iterator it = image_memory_budgets[i].begin();
+        std::list<std::pair<size_t, size_t> >::iterator it = image_memory_budgets[i].begin();
         while (it != image_memory_budgets[i].end())
         {
             // we cannot use it->first directly for base offset alignment
@@ -589,7 +589,7 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t
                 it->second -= aligned_size;
             }
 
-//             TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
+            //             TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
 
             return ptr;
         }
@@ -636,27 +636,26 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t
     // adjust image_memory_budgets
     image_memory_blocks.push_back(ptr->memory);
 
-    std::list< std::pair<size_t, size_t> > budget;
+    std::list<std::pair<size_t, size_t> > budget;
     if (new_block_size > aligned_size)
     {
         budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
     }
     image_memory_budgets.push_back(budget);
 
-//     TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
+    //     TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
 
     return ptr;
 }
 
-
 void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
 {
-//     TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);
+    //     TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);
 
     const int buffer_block_count = buffer_blocks.size();
 
     int block_index = -1;
-    for (int i=0; i<buffer_block_count; i++)
+    for (int i = 0; i < buffer_block_count; i++)
     {
         if (buffer_blocks[i]->buffer == ptr->buffer && buffer_blocks[i]->memory == ptr->memory)
         {
@@ -675,10 +674,10 @@ void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
     }
 
     // merge
-    std::list< std::pair<size_t, size_t> >::iterator it_merge_left = buffer_budgets[block_index].end();
-    std::list< std::pair<size_t, size_t> >::iterator it_merge_right = buffer_budgets[block_index].end();
-    std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[block_index].begin();
-    for ( ; it != buffer_budgets[block_index].end(); it++)
+    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = buffer_budgets[block_index].end();
+    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = buffer_budgets[block_index].end();
+    std::list<std::pair<size_t, size_t> >::iterator it = buffer_budgets[block_index].begin();
+    for (; it != buffer_budgets[block_index].end(); it++)
     {
         if (it->first + it->second == ptr->offset)
         {
@@ -722,12 +721,12 @@ void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
 
 void VkBlobAllocator::fastFree(VkImageMemory* ptr)
 {
-//     TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
+    //     TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
 
     const int image_memory_block_count = image_memory_blocks.size();
 
     int block_index = -1;
-    for (int i=0; i<image_memory_block_count; i++)
+    for (int i = 0; i < image_memory_block_count; i++)
     {
         if (image_memory_blocks[i] == ptr->memory)
         {
@@ -752,10 +751,10 @@ void VkBlobAllocator::fastFree(VkImageMemory* ptr)
     }
 
     // merge
-    std::list< std::pair<size_t, size_t> >::iterator it_merge_left = image_memory_budgets[block_index].end();
-    std::list< std::pair<size_t, size_t> >::iterator it_merge_right = image_memory_budgets[block_index].end();
-    std::list< std::pair<size_t, size_t> >::iterator it = image_memory_budgets[block_index].begin();
-    for ( ; it != image_memory_budgets[block_index].end(); it++)
+    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = image_memory_budgets[block_index].end();
+    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = image_memory_budgets[block_index].end();
+    std::list<std::pair<size_t, size_t> >::iterator it = image_memory_budgets[block_index].begin();
+    for (; it != image_memory_budgets[block_index].end(); it++)
     {
         if (it->first + it->second == ptr->bind_offset)
         {
@@ -803,7 +802,8 @@ void VkBlobAllocator::fastFree(VkImageMemory* ptr)
     }
 }
 
-VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev)
+VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev)
+    : VkAllocator(_vkdev)
 {
     buffer_offset_alignment = vkdev->info.buffer_offset_alignment;
     bind_memory_offset_alignment = vkdev->info.buffer_image_granularity;
@@ -818,7 +818,7 @@ VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkd
         buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size);
     }
 
-    block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment);// 8M
+    block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment); // 8M
 }
 
 VkWeightAllocator::~VkWeightAllocator()
@@ -827,7 +827,6 @@ VkWeightAllocator::~VkWeightAllocator()
     printf("run VkWeightAllocator descontruction function\n");
 }
 
-
 void VkWeightAllocator::clear()
 {
     printf("run VkWeightAllocator clear function\n");
@@ -842,9 +841,9 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
     const int buffer_block_count = buffer_blocks.size();
 
     // find first spare space in buffer_blocks
-    for (int i=0; i<buffer_block_count; i++)
+    for (int i = 0; i < buffer_block_count; i++)
     {
-	size_t free_size = buffer_block_free_spaces[i];
+        size_t free_size = buffer_block_free_spaces[i];
         if (free_size >= aligned_size)
         {
             size_t block_offset = block_size - free_size;
@@ -861,8 +860,8 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
 
             buffer_block_free_spaces[i] -= aligned_size;
 
-	    return ptr;
-	}
+            return ptr;
+        }
     }
     size_t new_block_size = std::max(block_size, aligned_size);
 
@@ -874,7 +873,7 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
 
     if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation)
     {
-	    VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2;
+        VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2;
         bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR;
         bufferMemoryRequirementsInfo2.pNext = 0;
         bufferMemoryRequirementsInfo2.buffer = block->buffer;
@@ -892,42 +891,42 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
 
         bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;
 
-	if (dedicatedAllocation)
+        if (dedicatedAllocation)
         {
-	    // setup memory type and alignment
-	    if (buffer_memory_type_index == (uint32_t)-1)
+            // setup memory type and alignment
+            if (buffer_memory_type_index == (uint32_t)-1)
             {
-		if (vkdev->info.type == 1)
-		{
-		    // integrated gpu, prefer unified memory
-		    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
-		}
-		else
-		{
-		    // discrete gpu, device local
-		    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-		}
-
-		mappable = vkdev->is_mappable(buffer_memory_type_index);
+                if (vkdev->info.type == 1)
+                {
+                    // integrated gpu, prefer unified memory
+                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+                }
+                else
+                {
+                    // discrete gpu, device local
+                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+                }
+
+                mappable = vkdev->is_mappable(buffer_memory_type_index);
                 coherent = vkdev->is_coherent(buffer_memory_type_index);
-	    }
+            }
 
-	    block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer);
-	    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
-	    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);
+            block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer);
+            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
+            vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);
 
-	    block->mapped_ptr = 0;
+            block->mapped_ptr = 0;
             if (mappable)
             {
                 vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
             }
 
-	    dedicated_buffer_blocks.push_back(block);
+            dedicated_buffer_blocks.push_back(block);
 
-	    // return sub buffer
+            // return sub buffer
             VkBufferMemory* ptr = new VkBufferMemory;
 
-	    ptr->buffer = block->buffer;
+            ptr->buffer = block->buffer;
             ptr->offset = 0;
             ptr->memory = block->memory;
             ptr->capacity = new_block_size;
@@ -936,7 +935,7 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
             ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 
             return ptr;
-	}
+        }
     }
 
     VkMemoryRequirements memoryRequirements;
@@ -945,18 +944,18 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
     // setup memory type and alignment
     if (buffer_memory_type_index == (uint32_t)-1)
     {
-	if (vkdev->info.type == 1)
+        if (vkdev->info.type == 1)
         {
             // integrated gpu, prefer unified memory
-	    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
-	}
-	else
-	{
-	    // discrete gpu, device local
-	    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-	}
-
-	mappable = vkdev->is_mappable(buffer_memory_type_index);
+            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+        }
+        else
+        {
+            // discrete gpu, device local
+            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+        }
+
+        mappable = vkdev->is_mappable(buffer_memory_type_index);
         coherent = vkdev->is_coherent(buffer_memory_type_index);
     }
 
@@ -965,7 +964,7 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
     // ignore memoryRequirements.alignment as we always bind at zero offset
     vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);
 
-//     printf("VkWeightAllocator M %p", block->buffer);
+    //     printf("VkWeightAllocator M %p", block->buffer);
     block->mapped_ptr = 0;
     if (mappable)
     {
@@ -1155,7 +1154,7 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int dims, int w, int h, int c, size
     const int image_memory_block_count = image_memory_blocks.size();
 
     // find first spare space in buffer_blocks
-    for (int i=0; i<image_memory_block_count; i++)
+    for (int i = 0; i < image_memory_block_count; i++)
     {
         // we cannot use image_memory_block_free_spaces[i] directly for base offset alignment
         size_t bind_base_offset = block_size - image_memory_block_free_spaces[i];
@@ -1241,17 +1240,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int dims, int w, int h, int c, size
     return ptr;
 }
 
-
 void VkWeightAllocator::fastFree(VkBufferMemory* ptr)
 {
-//     TLOG_INFO("VkWeightAllocator F %p", ptr->buffer);
+    //     TLOG_INFO("VkWeightAllocator F %p", ptr->buffer);
 
     delete ptr;
 }
 
 void VkWeightAllocator::fastFree(VkImageMemory* ptr)
 {
-//     TLOG_INFO("VkWeightAllocator F %p", ptr->memory);
+    //     TLOG_INFO("VkWeightAllocator F %p", ptr->memory);
 
     if (!ptr->command_refcount)
     {
@@ -1262,12 +1260,13 @@ void VkWeightAllocator::fastFree(VkImageMemory* ptr)
     }
 }
 
-VkStagingAllocator::VkStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev)
+VkStagingAllocator::VkStagingAllocator(const GPUDevice* _vkdev)
+    : VkAllocator(_vkdev)
 {
     mappable = true;
     coherent = true;
 
-    size_compare_ratio = 192;// 0.75f * 256
+    size_compare_ratio = 192; // 0.75f * 256
 }
 
 VkStagingAllocator::~VkStagingAllocator()
@@ -1277,13 +1276,13 @@ VkStagingAllocator::~VkStagingAllocator()
 
 void VkStagingAllocator::clear()
 {
-//     TLOG_INFO("VkStagingAllocator %lu", buffer_budgets.size());
+    //     TLOG_INFO("VkStagingAllocator %lu", buffer_budgets.size());
 
     for (std::list<VkBufferMemory*>::iterator it = buffer_budgets.begin(); it != buffer_budgets.end(); it++)
     {
         VkBufferMemory* ptr = *it;
 
-//         TLOG_INFO("VkStagingAllocator F %p", ptr->buffer);
+        //         TLOG_INFO("VkStagingAllocator F %p", ptr->buffer);
 
         vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
         vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
@@ -1310,7 +1309,7 @@ VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size)
         {
             buffer_budgets.erase(it);
 
-//             TLOG_INFO("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity);
+            //             TLOG_INFO("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity);
 
             return ptr;
         }
@@ -1342,7 +1341,7 @@ VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size)
     ptr->access_flags = 0;
     ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 
-//     TLOG_INFO("VkStagingAllocator M %p %lu", ptr->buffer, size);
+    //     TLOG_INFO("VkStagingAllocator M %p %lu", ptr->buffer, size);
 
     return ptr;
 }
@@ -1394,14 +1393,14 @@ VkImageMemory* VkStagingAllocator::fastMalloc(int dims, int w, int h, int c, siz
     ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
     ptr->command_refcount = 0;
 
-//     TLOG_INFO("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format);
+    //     TLOG_INFO("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format);
 
     return ptr;
 }
 
 void VkStagingAllocator::fastFree(VkBufferMemory* ptr)
 {
-//     TLOG_INFO("VkStagingAllocator F %p", ptr->buffer);
+    //     TLOG_INFO("VkStagingAllocator F %p", ptr->buffer);
 
     // return to buffer_budgets
     buffer_budgets.push_back(ptr);
@@ -1409,14 +1408,15 @@ void VkStagingAllocator::fastFree(VkBufferMemory* ptr)
 
 void VkStagingAllocator::fastFree(VkImageMemory* ptr)
 {
-//     TLOG_INFO("VkStagingAllocator F %p", ptr->image);
+    //     TLOG_INFO("VkStagingAllocator F %p", ptr->image);
 
     free(ptr->mapped_ptr);
 
     delete ptr;
 }
 
-VkWeightStagingAllocator::VkWeightStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev)
+VkWeightStagingAllocator::VkWeightStagingAllocator(const GPUDevice* _vkdev)
+    : VkAllocator(_vkdev)
 {
     mappable = true;
     coherent = true;
@@ -1455,14 +1455,14 @@ VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size)
     ptr->access_flags = 0;
     ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 
-//     printf("VkWeightStagingAllocator M %p %lu", ptr->buffer, size);
+    //     printf("VkWeightStagingAllocator M %p %lu", ptr->buffer, size);
 
     return ptr;
 }
 
 void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr)
 {
-//     TLOG_INFO("VkWeightStagingAllocator F %p", ptr->buffer);
+    //     TLOG_INFO("VkWeightStagingAllocator F %p", ptr->buffer);
 
     vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
     vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
diff --git a/source/device/vulkan/vulkan_allocator.hpp b/source/device/vulkan/vulkan_allocator.hpp
index 4a8f7e1c3..ffb0e4360 100644
--- a/source/device/vulkan/vulkan_allocator.hpp
+++ b/source/device/vulkan/vulkan_allocator.hpp
@@ -10,17 +10,18 @@
 #include "vulkan_platform.hpp"
 
 namespace TEngine {
-    
-#define MALLOC_ALIGN    16
 
-template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
+#define MALLOC_ALIGN 16
+
+template<typename _Tp>
+static inline _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
 {
-    return (_Tp*)(((size_t)ptr + n-1) & -n);
+    return (_Tp*)(((size_t)ptr + n - 1) & -n);
 }
 
 static inline size_t alignSize(size_t sz, int n)
 {
-    return (sz + n-1) & -n;
+    return (sz + n - 1) & -n;
 }
 
 static inline void* fastMalloc(size_t size)
@@ -42,8 +43,12 @@ static inline void fastFree(void* ptr)
     }
 }
 
-static inline int TENGINE_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
-
+static inline int TENGINE_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
 
 class Allocator
 {
@@ -158,8 +163,13 @@ class VkAllocator
 {
 public:
     VkAllocator(const GPUDevice* _vkdev);
-    virtual ~VkAllocator() { clear(); }
-    virtual void clear() {}
+    virtual ~VkAllocator()
+    {
+        clear();
+    }
+    virtual void clear()
+    {
+    }
 
     virtual VkBufferMemory* fastMalloc(size_t size) = 0;
     virtual void fastFree(VkBufferMemory* ptr) = 0;
@@ -198,16 +208,16 @@ class VkBlobAllocator : public VkAllocator
     virtual VkBufferMemory* fastMalloc(size_t size);
     virtual void fastFree(VkBufferMemory* ptr);
 
-    virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; }
+    virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); //{ return 0; }
     virtual void fastFree(VkImageMemory* ptr);
 
 protected:
     size_t block_size;
     size_t buffer_offset_alignment;
     size_t bind_memory_offset_alignment;
-    std::vector< std::list< std::pair<size_t, size_t> > > buffer_budgets;
+    std::vector<std::list<std::pair<size_t, size_t> > > buffer_budgets;
     std::vector<VkBufferMemory*> buffer_blocks;
-    std::vector< std::list< std::pair<size_t, size_t> > > image_memory_budgets;
+    std::vector<std::list<std::pair<size_t, size_t> > > image_memory_budgets;
     std::vector<VkDeviceMemory> image_memory_blocks;
 };
 
@@ -224,7 +234,7 @@ class VkWeightAllocator : public VkAllocator
 public:
     virtual VkBufferMemory* fastMalloc(size_t size);
     virtual void fastFree(VkBufferMemory* ptr);
-    virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; }
+    virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); //{ return 0; }
     virtual void fastFree(VkImageMemory* ptr);
 
 protected:
@@ -239,7 +249,6 @@ class VkWeightAllocator : public VkAllocator
     std::vector<VkDeviceMemory> dedicated_image_memory_blocks;
 };
 
-
 class VkStagingAllocator : public VkAllocator
 {
 public:
@@ -256,15 +265,14 @@ class VkStagingAllocator : public VkAllocator
 
     virtual VkBufferMemory* fastMalloc(size_t size);
     virtual void fastFree(VkBufferMemory* ptr);
-    virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; }
+    virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); //{ return 0; }
     virtual void fastFree(VkImageMemory* ptr);
 
 protected:
-    unsigned int size_compare_ratio;// 0~256
+    unsigned int size_compare_ratio; // 0~256
     std::list<VkBufferMemory*> buffer_budgets;
 };
 
-
 class VkWeightStagingAllocator : public VkAllocator
 {
 public:
@@ -274,11 +282,16 @@ class VkWeightStagingAllocator : public VkAllocator
 public:
     virtual VkBufferMemory* fastMalloc(size_t size);
     virtual void fastFree(VkBufferMemory* ptr);
-    virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) { return 0; }
-    virtual void fastFree(VkImageMemory* /*ptr*/) {}
+    virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
+    {
+        return 0;
+    }
+    virtual void fastFree(VkImageMemory* /*ptr*/)
+    {
+    }
 
 protected:
 };
 
-}
+} // namespace TEngine
 #endif
diff --git a/source/device/vulkan/vulkan_command.cpp b/source/device/vulkan/vulkan_command.cpp
index b5545fe6b..05a7299ea 100644
--- a/source/device/vulkan/vulkan_command.cpp
+++ b/source/device/vulkan/vulkan_command.cpp
@@ -31,7 +31,8 @@
 
 namespace TEngine {
 
-VkCompute::VkCompute(const GPUDevice* _vkdev) : vkdev(_vkdev)
+VkCompute::VkCompute(const GPUDevice* _vkdev)
+    : vkdev(_vkdev)
 {
     compute_command_pool = 0;
     compute_command_buffer = 0;
@@ -40,10 +41,9 @@ VkCompute::VkCompute(const GPUDevice* _vkdev) : vkdev(_vkdev)
     init();
 }
 
-
 VkCompute::~VkCompute()
 {
-    for (size_t i=0; i<image_blocks_to_destroy.size(); i++)
+    for (size_t i = 0; i < image_blocks_to_destroy.size(); i++)
     {
         VkImageMemory* ptr = image_blocks_to_destroy[i];
 
@@ -65,7 +65,7 @@ VkCompute::~VkCompute()
 
     if (!vkdev->info.support_VK_KHR_push_descriptor)
     {
-        for (size_t i=0; i<descriptorsets.size(); i++)
+        for (size_t i = 0; i < descriptorsets.size(); i++)
         {
             vkFreeDescriptorSets(vkdev->vkdevice(), descriptor_pools[i], 1, &descriptorsets[i]);
             vkDestroyDescriptorPool(vkdev->vkdevice(), descriptor_pools[i], 0);
@@ -82,76 +82,76 @@ void VkCompute::record_upload(tensor* src, VkTensor& dst, const Option& opt)
 {
     Tensor src_tensor = Tensor(src);
     record_upload(src_tensor, dst, opt);
-//     // const ir_tensor* src_fp16;
-//     // if (src.elemsize == src.elempack * 4u)
-//     if(src->elem_size == opt.elempack * 4u)
-//     {
-//         // cpu cast to fp16 (discrete gpu)
-//         if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0)))
-//         {
-//             // ncnn::cast_float32_to_float16(src, src_fp16, opt);
-//             printf("need to add cast_float32_to_float16 here, fix me!\n");
-//         }
-//         else
-//         {
-//             // src_fp16 = src;
-//         }
-//     }
-//     else
-//     {
-//         // src_fp16 = src;
-//     }
-
-//     // upload
-//     VkTensor dst_staging;
-//     if (opt.blob_vkallocator->mappable)
-//     {
-//         // dst_staging.create_like(src_fp16, opt.blob_vkallocator);
-//         dst_staging.create_like(src, opt.blob_vkallocator);
-//     }
-//     else
-//     {
-//         // dst_staging.create_like(src_fp16, opt.staging_vkallocator);
-//         dst_staging.create_like(src, opt.staging_vkallocator);
-//     }
-//     if (dst_staging.empty())
-//         return;
-
-//     // stash staging
-//     upload_staging_buffers.push_back(dst_staging);
-
-// //     TLOG_INFO("upload_staging_buffer %p  ->   %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity());
-
-//     // memcpy src to device
-//     // memcpy(dst_staging.mapped_ptr(), src_fp16->data, src_fp16->elem_size * src_fp16->elem_num);
-//     memcpy(dst_staging.mapped_ptr(), src->data, src->elem_size * src->elem_num);
-//     dst_staging.allocator->flush(dst_staging.data);
-
-//     // mark device host-write @ null
-//     dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
-//     dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
-
-//     // TODO
-//     // not use pack for now------------------------
-//     // // resolve dst_elempack
-//     int dims = src->dim_num;
-//     int elemcount = 0;
-//     // src dims[0-3]  n c h w
-//     // if (dims == 1) elemcount = opt.elempack * src_fp16.w;
-//     // if (dims == 2) elemcount = opt.elempack * src_fp16.h;
-//     // if (dims == 3) elemcount = opt.elempack * src_fp16.c;
-//     if(dims == 4) 
-//         elemcount = opt.elempack * src->dims[1];
-//     else 
-//         elemcount = opt.elempack * src->dims[0];
-
-//     int dst_elempack = 1;
-//     if (opt.use_shader_pack8)
-//         dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1;
-//     else
-//         dst_elempack = elemcount % 4 == 0 ? 4 : 1;
-
-//     vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
+    //     // const ir_tensor* src_fp16;
+    //     // if (src.elemsize == src.elempack * 4u)
+    //     if(src->elem_size == opt.elempack * 4u)
+    //     {
+    //         // cpu cast to fp16 (discrete gpu)
+    //         if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0)))
+    //         {
+    //             // ncnn::cast_float32_to_float16(src, src_fp16, opt);
+    //             printf("need to add cast_float32_to_float16 here, fix me!\n");
+    //         }
+    //         else
+    //         {
+    //             // src_fp16 = src;
+    //         }
+    //     }
+    //     else
+    //     {
+    //         // src_fp16 = src;
+    //     }
+
+    //     // upload
+    //     VkTensor dst_staging;
+    //     if (opt.blob_vkallocator->mappable)
+    //     {
+    //         // dst_staging.create_like(src_fp16, opt.blob_vkallocator);
+    //         dst_staging.create_like(src, opt.blob_vkallocator);
+    //     }
+    //     else
+    //     {
+    //         // dst_staging.create_like(src_fp16, opt.staging_vkallocator);
+    //         dst_staging.create_like(src, opt.staging_vkallocator);
+    //     }
+    //     if (dst_staging.empty())
+    //         return;
+
+    //     // stash staging
+    //     upload_staging_buffers.push_back(dst_staging);
+
+    // //     TLOG_INFO("upload_staging_buffer %p  ->   %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity());
+
+    //     // memcpy src to device
+    //     // memcpy(dst_staging.mapped_ptr(), src_fp16->data, src_fp16->elem_size * src_fp16->elem_num);
+    //     memcpy(dst_staging.mapped_ptr(), src->data, src->elem_size * src->elem_num);
+    //     dst_staging.allocator->flush(dst_staging.data);
+
+    //     // mark device host-write @ null
+    //     dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
+    //     dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
+
+    //     // TODO
+    //     // not use pack for now------------------------
+    //     // // resolve dst_elempack
+    //     int dims = src->dim_num;
+    //     int elemcount = 0;
+    //     // src dims[0-3]  n c h w
+    //     // if (dims == 1) elemcount = opt.elempack * src_fp16.w;
+    //     // if (dims == 2) elemcount = opt.elempack * src_fp16.h;
+    //     // if (dims == 3) elemcount = opt.elempack * src_fp16.c;
+    //     if(dims == 4)
+    //         elemcount = opt.elempack * src->dims[1];
+    //     else
+    //         elemcount = opt.elempack * src->dims[0];
+
+    //     int dst_elempack = 1;
+    //     if (opt.use_shader_pack8)
+    //         dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1;
+    //     else
+    //         dst_elempack = elemcount % 4 == 0 ? 4 : 1;
+
+    //     vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
 }
 
 void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& opt)
@@ -193,7 +193,7 @@ void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& op
     // stash staging
     upload_staging_buffers.push_back(dst_staging);
 
-//     TLOG_INFO("upload_staging_buffer %p  ->   %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity());
+    //     TLOG_INFO("upload_staging_buffer %p  ->   %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity());
 
     // memcpy src to device
     memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize);
@@ -212,10 +212,11 @@ void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& op
 
     int dst_elempack = 1;
     if (opt.use_shader_pack8)
-        dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1;
+        dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4
+                                                                   : 1;
     else
         dst_elempack = elemcount % 4 == 0 ? 4 : 1;
-    
+
     // gpu cast to fp16 on the fly (integrated gpu)
     vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
 }
@@ -384,71 +385,71 @@ int VkCompute::submit_and_wait()
         // printf("delayed_records count:%d\n", record_count);
 
         // handle delayed records
-        for (size_t i=0; i<record_count; i++)
+        for (size_t i = 0; i < record_count; i++)
         {
             const record& r = delayed_records[i];
 
             switch (r.type)
             {
-                case record::TYPE_copy_buffer:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_copy_image:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_copy_buffer_to_image:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_copy_image_to_buffer:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_bind_pipeline:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_bind_descriptorsets:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_push_constants:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_dispatch:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_memory_barrers:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_buffer_barrers:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_image_barrers:
-                {
-                    // TODO
-                    break;
-                }
-                case record::TYPE_post_download:
-                case record::TYPE_post_cast_float16_to_float32:
-                default:
-                    break;	
+            case record::TYPE_copy_buffer:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_copy_image:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_copy_buffer_to_image:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_copy_image_to_buffer:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_bind_pipeline:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_bind_descriptorsets:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_push_constants:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_dispatch:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_memory_barrers:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_buffer_barrers:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_image_barrers:
+            {
+                // TODO
+                break;
+            }
+            case record::TYPE_post_download:
+            case record::TYPE_post_cast_float16_to_float32:
+            default:
+                break;
             }
         }
     }
@@ -500,32 +501,32 @@ int VkCompute::submit_and_wait()
     }
 
     // handle delayed post records
-    for (size_t i=0; i<delayed_records.size(); i++)
+    for (size_t i = 0; i < delayed_records.size(); i++)
     {
         const record& r = delayed_records[i];
 
         switch (r.type)
         {
-            case record::TYPE_post_download:
-            {
-                const VkTensor& src = download_post_buffers[r.post_download.download_post_buffer_mat_offset];
-                Tensor dst = download_post_tensors_fp16[r.post_download.download_post_mat_fp16_offset];
+        case record::TYPE_post_download:
+        {
+            const VkTensor& src = download_post_buffers[r.post_download.download_post_buffer_mat_offset];
+            Tensor dst = download_post_tensors_fp16[r.post_download.download_post_mat_fp16_offset];
 
-    //             TLOG_INFO("post_download  %p +%d ~%d  -> %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data);
+            //             TLOG_INFO("post_download  %p +%d ~%d  -> %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data);
 
-                src.allocator->invalidate(src.data);
-                // memcpy(dst.data, src.mapped_ptr(), dst.elem_size * dst.elem_num);
-                memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize);
-                break;
-            }
-            case record::TYPE_post_cast_float16_to_float32:
-            {
-                // TODO
-                printf("submit delayed_records TYPE_post_cast_float16_to_float32, Do nothing, fix me\n");
-                break;
-            }
-            default:
-                break;
+            src.allocator->invalidate(src.data);
+            // memcpy(dst.data, src.mapped_ptr(), dst.elem_size * dst.elem_num);
+            memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize);
+            break;
+        }
+        case record::TYPE_post_cast_float16_to_float32:
+        {
+            // TODO
+            printf("submit delayed_records TYPE_post_cast_float16_to_float32, Do nothing, fix me\n");
+            break;
+        }
+        default:
+            break;
         }
     }
 
@@ -534,7 +535,6 @@ int VkCompute::submit_and_wait()
     return 0;
 }
 
-
 int VkCompute::init()
 {
     // compute_command_pool
@@ -664,7 +664,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
 
     int buffer_index = 0;
     int image_index = 0;
-    for (int i=0; i<binding_count; i++)
+    for (int i = 0; i < binding_count; i++)
     {
         int binding_type = pipeline->shader_info.binding_types[i];
 
@@ -673,7 +673,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
             const VkTensor& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index];
             buffer_index++;
 
-//             TLOG_INFO("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity());
+            //             TLOG_INFO("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity());
 
             if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
             {
@@ -719,7 +719,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
             const VkImageTensor& binding = image_bindings[image_index].empty() ? vkdev->get_dummy_image() : image_bindings[image_index];
             image_index++;
 
-//             TLOG_INFO("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview());
+            //             TLOG_INFO("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview());
 
             if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
             {
@@ -775,11 +775,11 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
             const VkImageTensor& binding = image_bindings[image_index].empty() ? vkdev->get_dummy_image() : image_bindings[image_index];
             image_index++;
 
-//             TLOG_INFO("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview());
+            //             TLOG_INFO("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview());
 
             // if the same image used for both storage image and combined image sampler
             // only apply image layout transition to general
-            for (int j=0; j<image_binding_count; j++)
+            for (int j = 0; j < image_binding_count; j++)
             {
                 if (pipeline->shader_info.binding_types[j] == 2 && binding.data == image_bindings[j].data)
                 {
@@ -865,7 +865,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
             unsigned char* p_descriptorInfos = descriptorInfos.data();
             int descriptorBufferInfo_index = 0;
             int descriptorImageInfo_index = 0;
-            for (int i=0; i<binding_count; i++)
+            for (int i = 0; i < binding_count; i++)
             {
                 int binding_type = pipeline->shader_info.binding_types[i];
 
@@ -910,7 +910,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
             {
                 int image_binding_count = 0;
                 int sampler_binding_count = 0;
-                for (int i=0; i<binding_count; i++)
+                for (int i = 0; i < binding_count; i++)
                 {
                     int binding_type = pipeline->shader_info.binding_types[i];
 
@@ -972,7 +972,7 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
                 std::vector<VkWriteDescriptorSet> writeDescriptorSets(binding_count);
                 {
                     const unsigned char* p_descriptorInfos = descriptorInfos.data();
-                    for (int i=0; i<binding_count; i++)
+                    for (int i = 0; i < binding_count; i++)
                     {
                         int binding_type = pipeline->shader_info.binding_types[i];
 
@@ -1072,7 +1072,8 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkTe
     }
 }
 
-VkTransfer::VkTransfer(const GPUDevice* _vkdev) : vkdev(_vkdev)
+VkTransfer::VkTransfer(const GPUDevice* _vkdev)
+    : vkdev(_vkdev)
 {
     compute_command_pool = 0;
     transfer_command_pool = 0;
@@ -1153,7 +1154,7 @@ int VkTransfer::init()
         {
             printf("vkCreateFence failed %d", ret);
             return -1;
-        } 
+        }
     }
 
     if (!vkdev->info.unified_compute_transfer_queue)
@@ -1174,8 +1175,8 @@ int VkTransfer::init()
             }
         }
 
-    // upload_command_buffer
-    {
+        // upload_command_buffer
+        {
             VkCommandBufferAllocateInfo commandBufferAllocateInfo;
             commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
             commandBufferAllocateInfo.pNext = 0;
@@ -1189,10 +1190,10 @@ int VkTransfer::init()
                 printf("vkAllocateCommandBuffers failed %d", ret);
                 return -1;
             }
-    }
+        }
 
-    // upload_compute_semaphore
-    {
+        // upload_compute_semaphore
+        {
             VkSemaphoreCreateInfo semaphoreCreateInfo;
             semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
             semaphoreCreateInfo.pNext = 0;
@@ -1200,15 +1201,15 @@ int VkTransfer::init()
 
             VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore);
 
-        if (ret != VK_SUCCESS)
-        {
+            if (ret != VK_SUCCESS)
+            {
                 printf("vkCreateSemaphore failed %d", ret);
-        return -1;
+                return -1;
+            }
         }
-    }
 
-    // upload_command_fence
-    {
+        // upload_command_fence
+        {
             VkFenceCreateInfo fenceCreateInfo;
             fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
             fenceCreateInfo.pNext = 0;
@@ -1216,13 +1217,13 @@ int VkTransfer::init()
 
             VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence);
 
-        if (ret != VK_SUCCESS)
+            if (ret != VK_SUCCESS)
             {
                 printf("vkCreateFence failed %d", ret);
                 return -1;
+            }
         }
     }
-    }
 
     begin_command_buffer();
 
@@ -1266,7 +1267,6 @@ int VkTransfer::begin_command_buffer()
     return 0;
 }
 
-
 int VkTransfer::end_command_buffer()
 {
     {
@@ -1362,9 +1362,9 @@ int VkTransfer::submit_and_wait()
                 return -1;
             }
         }
-        
+
         {
-            VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;// FIXME
+            VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; // FIXME
             VkSubmitInfo submitInfo;
             submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
             submitInfo.pNext = 0;
@@ -1386,11 +1386,11 @@ int VkTransfer::submit_and_wait()
                 return -1;
             }
         }
-        
+
         vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue);
     }
     vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue);
-    
+
     // wait
     if (vkdev->info.unified_compute_transfer_queue)
     {
@@ -1403,7 +1403,7 @@ int VkTransfer::submit_and_wait()
     }
     else
     {
-        VkFence fences[2] = { upload_command_fence, compute_command_fence };
+        VkFence fences[2] = {upload_command_fence, compute_command_fence};
 
         VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, UINT64_MAX);
         if (ret != VK_SUCCESS)
@@ -1417,7 +1417,7 @@ int VkTransfer::submit_and_wait()
 
 void VkTransfer::record_upload(const Tensor& src, VkTensor& dst, const Option& opt)
 {
-//     TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack);
+    //     TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack);
 
     // NOTE keep the hack here ?
     if (src.elemsize == src.elempack * 4u)
@@ -1596,7 +1596,7 @@ void VkTransfer::record_upload(const Tensor& src, VkTensor& dst, const Option& o
 
 void VkTransfer::record_upload(const tensor* src, VkTensor& dst, const Option& opt)
 {
-//     TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack);
+    //     TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack);
 
     // NOTE keep the hack here ?
     // printf("elem size: %d, elempack:%d\n", src.elemsize, src.elempack);
diff --git a/source/device/vulkan/vulkan_command.hpp b/source/device/vulkan/vulkan_command.hpp
index 1f5e82e06..345371066 100644
--- a/source/device/vulkan/vulkan_command.hpp
+++ b/source/device/vulkan/vulkan_command.hpp
@@ -55,7 +55,7 @@ class VkCompute
     void record_pipeline(const Pipeline* pipeline, const std::vector<VkTensor>& buffer_bindings, const std::vector<VkImageTensor>& image_bindings, const std::vector<vk_constant_type>& constants, const VkTensor& dispatcher);
     void record_pipeline(const Pipeline* pipeline, const std::vector<VkTensor>& buffer_bindings, const std::vector<VkImageTensor>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageTensor& dispatcher);
     void record_pipeline(const Pipeline* pipeline, const std::vector<VkTensor>& buffer_bindings, const std::vector<VkImageTensor>& image_bindings, const std::vector<vk_constant_type>& constants, int dispatcher_w, int dispatcher_h, int dispatcher_c);
-    
+
     int submit_and_wait();
 
     int reset();
@@ -106,35 +106,110 @@ class VkCompute
 
         union
         {
-            struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer;
-            struct { VkImage src; VkImageLayout src_layout; VkImage dst; VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image;
-            struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image;
-            struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer;
-
-            struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline;
-            struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets;
-            struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants;
-
-            struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch;
-
-            struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers;
-            struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers;
-            struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers;
-
-            struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download;
-            struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; } post_cast_float16_to_float32;
+            struct
+            {
+                VkBuffer src;
+                VkBuffer dst;
+                uint32_t region_count;
+                const VkBufferCopy* regions;
+            } copy_buffer;
+            struct
+            {
+                VkImage src;
+                VkImageLayout src_layout;
+                VkImage dst;
+                VkImageLayout dst_layout;
+                uint32_t region_count;
+                const VkImageCopy* regions;
+            } copy_image;
+            struct
+            {
+                VkBuffer src;
+                VkImage dst;
+                VkImageLayout layout;
+                uint32_t region_count;
+                const VkBufferImageCopy* regions;
+            } copy_buffer_to_image;
+            struct
+            {
+                VkImage src;
+                VkImageLayout layout;
+                VkBuffer dst;
+                uint32_t region_count;
+                const VkBufferImageCopy* regions;
+            } copy_image_to_buffer;
+
+            struct
+            {
+                VkPipelineBindPoint bind_point;
+                VkPipeline pipeline;
+            } bind_pipeline;
+            struct
+            {
+                VkPipelineBindPoint bind_point;
+                VkPipelineLayout pipeline_layout;
+                uint32_t descriptorset_count;
+                uint32_t descriptorset_offset;
+            } bind_descriptorsets;
+            struct
+            {
+                VkPipelineLayout pipeline_layout;
+                VkShaderStageFlags stage_flags;
+                uint32_t size;
+                const void* values;
+            } push_constants;
+
+            struct
+            {
+                uint32_t group_count_x;
+                uint32_t group_count_y;
+                uint32_t group_count_z;
+            } dispatch;
+
+            struct
+            {
+                VkPipelineStageFlags src_stage;
+                VkPipelineStageFlags dst_stage;
+                uint32_t barrier_count;
+                const VkMemoryBarrier* barriers;
+            } memory_barrers;
+            struct
+            {
+                VkPipelineStageFlags src_stage;
+                VkPipelineStageFlags dst_stage;
+                uint32_t barrier_count;
+                const VkBufferMemoryBarrier* barriers;
+            } buffer_barrers;
+            struct
+            {
+                VkPipelineStageFlags src_stage;
+                VkPipelineStageFlags dst_stage;
+                uint32_t barrier_count;
+                const VkImageMemoryBarrier* barriers;
+            } image_barrers;
+
+            struct
+            {
+                uint32_t download_post_buffer_mat_offset;
+                uint32_t download_post_mat_fp16_offset;
+            } post_download;
+            struct
+            {
+                uint32_t download_post_mat_fp16_offset;
+                uint32_t download_post_mat_offset;
+            } post_cast_float16_to_float32;
         };
     };
 
     std::vector<record> delayed_records;
 };
 
-
 class VkTransfer
 {
 public:
     VkTransfer(const GPUDevice* vkdev);
     ~VkTransfer();
+
 public:
     void record_upload(const tensor* src, VkTensor& dst, const Option& opt);
     void record_upload(const Tensor& src, VkTensor& dst, const Option& opt);
diff --git a/source/device/vulkan/vulkan_define.h b/source/device/vulkan/vulkan_define.h
index e0c68277a..68de6df99 100644
--- a/source/device/vulkan/vulkan_define.h
+++ b/source/device/vulkan/vulkan_define.h
@@ -26,9 +26,8 @@
 
 #define VULKAN_DEV_NAME "VK"
 
-
 typedef struct vulkan_option
 {
     char* dev_name;
-    int precision;      //!< precision of calculation
+    int precision; //!< precision of calculation
 } vulkan_opt_t;
diff --git a/source/device/vulkan/vulkan_device.hpp b/source/device/vulkan/vulkan_device.hpp
index 9560261fe..1fee0d5e1 100644
--- a/source/device/vulkan/vulkan_device.hpp
+++ b/source/device/vulkan/vulkan_device.hpp
@@ -26,8 +26,7 @@
 
 #include "vulkan_define.h"
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 
diff --git a/source/device/vulkan/vulkan_executor.hpp b/source/device/vulkan/vulkan_executor.hpp
index 28ae46efb..c4cc99a6c 100644
--- a/source/device/vulkan/vulkan_executor.hpp
+++ b/source/device/vulkan/vulkan_executor.hpp
@@ -22,9 +22,7 @@
  * Author: lswang@openailab.com
  */
 
-
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 #include "graph/tensor.h"
@@ -57,15 +55,15 @@ struct VULKANqueue
     int dims;
     // cl_kernel queue_kernel;
     // cl_event enentPoint;
-    size_t *queue_global_work_size;
-    size_t *queue_local_work_size;
+    size_t* queue_global_work_size;
+    size_t* queue_local_work_size;
 };
 
 class VULKANEngine
 {
 public:
-//    VULKANEngine();
-//    ~VULKANEngine() = default;
+    //    VULKANEngine();
+    //    ~VULKANEngine() = default;
 
     int VULKANEnginePreRun(struct subgraph* subgraph);
     int VULKANEngineRun(struct subgraph* subgraph);
@@ -75,15 +73,10 @@ class VULKANEngine
     bool init();
 
 private:
-
 public:
     // dict_uint2clmem             vulkan_tensor_map;
-    std::vector<struct VULKANqueue>    queue_list;
+    std::vector<struct VULKANqueue> queue_list;
 
 public:
     int bin_num;
-
 };
-
-
-
diff --git a/source/device/vulkan/vulkan_gpu.cpp b/source/device/vulkan/vulkan_gpu.cpp
index dac4e9486..fba68aa70 100644
--- a/source/device/vulkan/vulkan_gpu.cpp
+++ b/source/device/vulkan/vulkan_gpu.cpp
@@ -80,8 +80,7 @@ struct layer_shader_registry_entry
 
 #include "layer_shader_spv_data.h"
 
-static const layer_shader_registry_entry layer_shader_registry[] =
-{
+static const layer_shader_registry_entry layer_shader_registry[] = {
 #include "layer_shader_registry.h"
 };
 
@@ -130,21 +129,23 @@ PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0;
 // compile with old vulkan sdk
 #if VK_HEADER_VERSION < 80
 #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
-typedef struct VkPhysicalDevice8BitStorageFeaturesKHR {
-    VkStructureType    sType;
-    void*              pNext;
-    VkBool32           storageBuffer8BitAccess;
-    VkBool32           uniformAndStorageBuffer8BitAccess;
-    VkBool32           storagePushConstant8;
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
 } VkPhysicalDevice8BitStorageFeaturesKHR;
 #endif // VK_HEADER_VERSION < 80
 #if VK_HEADER_VERSION < 95
 #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
-typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR {
-    VkStructureType    sType;
-    void*              pNext;
-    VkBool32           shaderFloat16;
-    VkBool32           shaderInt8;
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
 } VkPhysicalDeviceFloat16Int8FeaturesKHR;
 #endif // VK_HEADER_VERSION < 95
 
@@ -157,7 +158,7 @@ static int init_instance_extension()
 
     if (support_VK_KHR_get_physical_device_properties2)
     {
-	vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR");
+        vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR");
         vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR");
         vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR");
         vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR");
@@ -225,19 +226,19 @@ void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT
 static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
 {
     // first try, compute only queue
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
         if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
-        && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
+            && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
         {
             return i;
         }
     }
 
     // second try, any queue with compute and graphics
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
@@ -249,7 +250,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
     }
 
     // third try, any queue with compute
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
@@ -265,7 +266,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
 static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
 {
     // first try, graphics only queue
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
@@ -277,7 +278,7 @@ static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProper
     }
 
     // second try, any queue with graphics and compute
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
@@ -289,7 +290,7 @@ static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProper
     }
 
     // third try, any queue with graphics
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
@@ -299,14 +300,14 @@ static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProper
         }
     }
 
-//     TLOG_INFO("no graphics queue\n");
+    //     TLOG_INFO("no graphics queue\n");
     return -1;
 }
 
 static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
 {
     // first try, transfer only queue
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
@@ -319,7 +320,7 @@ static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProper
     }
 
     // second try, any queue with transfer
-    for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
+    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
     {
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
@@ -343,21 +344,21 @@ static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProper
         return graphics_queue_index;
     }
 
-//     TLOG_INFO("no transfer queue\n");
+    //     TLOG_INFO("no transfer queue\n");
     return -1;
 }
 
 static int find_default_vulkan_device_index()
 {
     // first try, discrete gpu
-    for (int i=0; i<g_gpu_count; i++)
+    for (int i = 0; i < g_gpu_count; i++)
     {
         if (g_gpu_infos[i].type == 0)
             return i;
     }
 
     // second try, integrated gpu
-    for (int i=0; i<g_gpu_count; i++)
+    for (int i = 0; i < g_gpu_count; i++)
     {
         if (g_gpu_infos[i].type == 1)
             return i;
@@ -394,10 +395,10 @@ int create_gpu_instance()
         return -1;
     }
 
-    for (uint32_t i=0; i<instanceLayerPropertyCount; i++)
+    for (uint32_t i = 0; i < instanceLayerPropertyCount; i++)
     {
         const VkLayerProperties& lp = instanceLayerProperties[i];
-//         TLOG_INFO("instance layer %s = %u\n", lp.layerName, lp.implementationVersion);
+        //         TLOG_INFO("instance layer %s = %u\n", lp.layerName, lp.implementationVersion);
 
         if (strcmp(lp.layerName, "VK_LAYER_LUNARG_standard_validation") == 0)
         {
@@ -435,10 +436,10 @@ int create_gpu_instance()
 #if __ANDROID_API__ >= 26
     support_VK_KHR_android_surface = 0;
 #endif // __ANDROID_API__ >= 26
-    for (uint32_t j=0; j<instanceExtensionPropertyCount; j++)
+    for (uint32_t j = 0; j < instanceExtensionPropertyCount; j++)
     {
         const VkExtensionProperties& exp = instanceExtensionProperties[j];
-//         TLOG_INFO("instance extension %s = %u\n", exp.extensionName, exp.specVersion);
+        //         TLOG_INFO("instance extension %s = %u\n", exp.extensionName, exp.specVersion);
 
         if (strcmp(exp.extensionName, "VK_KHR_external_memory_capabilities") == 0)
             support_VK_KHR_external_memory_capabilities = exp.specVersion;
@@ -541,7 +542,7 @@ int create_gpu_instance()
 
     // find proper device and queue
     int gpu_info_index = 0;
-    for (uint32_t i=0; i<physicalDeviceCount; i++)
+    for (uint32_t i = 0; i < physicalDeviceCount; i++)
     {
         const VkPhysicalDevice& physicalDevice = physicalDevices[i];
         GpuInfo& gpu_info = g_gpu_infos[gpu_info_index];
@@ -550,15 +551,15 @@ int create_gpu_instance()
         VkPhysicalDeviceProperties physicalDeviceProperties;
         vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties);
 
-//         TLOG_INFO("[%u] apiVersion = %u.%u.%u\n", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion),
-//             VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion));
-//         TLOG_INFO("[%u] driverVersion = %u.%u.%u\n", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion),
-//             VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion));
-//         TLOG_INFO("[%u] vendorID = %x\n", i, physicalDeviceProperties.vendorID);
-//         TLOG_INFO("[%u] deviceID = %x\n", i, physicalDeviceProperties.deviceID);
-//         TLOG_INFO("[%u] deviceType = %x\n", i, physicalDeviceProperties.deviceType);
-//         TLOG_INFO("[%u] deviceName = %s\n", i, physicalDeviceProperties.deviceName);
-//         TLOG_INFO("[%u] pipelineCacheUUID = %u\n", i, physicalDeviceProperties.pipelineCacheUUID);
+        //         TLOG_INFO("[%u] apiVersion = %u.%u.%u\n", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion),
+        //             VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion));
+        //         TLOG_INFO("[%u] driverVersion = %u.%u.%u\n", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion),
+        //             VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion));
+        //         TLOG_INFO("[%u] vendorID = %x\n", i, physicalDeviceProperties.vendorID);
+        //         TLOG_INFO("[%u] deviceID = %x\n", i, physicalDeviceProperties.deviceID);
+        //         TLOG_INFO("[%u] deviceType = %x\n", i, physicalDeviceProperties.deviceType);
+        //         TLOG_INFO("[%u] deviceName = %s\n", i, physicalDeviceProperties.deviceName);
+        //         TLOG_INFO("[%u] pipelineCacheUUID = %u\n", i, physicalDeviceProperties.pipelineCacheUUID);
 
         gpu_info.bug_local_size_spec_const = false;
         gpu_info.bug_implicit_fp16_arithmetic = false;
@@ -693,10 +694,10 @@ int create_gpu_instance()
 #if __ANDROID_API__ >= 26
         gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0;
 #endif // __ANDROID_API__ >= 26
-        for (uint32_t j=0; j<deviceExtensionPropertyCount; j++)
+        for (uint32_t j = 0; j < deviceExtensionPropertyCount; j++)
         {
             const VkExtensionProperties& exp = deviceExtensionProperties[j];
-//             TLOG_INFO("device extension %s = %u\n", exp.extensionName, exp.specVersion);
+            //             TLOG_INFO("device extension %s = %u\n", exp.extensionName, exp.specVersion);
 
             if (strcmp(exp.extensionName, "VK_KHR_8bit_storage") == 0)
                 gpu_info.support_VK_KHR_8bit_storage = exp.specVersion;
@@ -811,9 +812,9 @@ int create_gpu_instance()
         }
         else
         {
-//             // TODO
-//             VkPhysicalDeviceFeatures features;
-//             vkGetPhysicalDeviceFeatures(physicalDevice, &features);
+            //             // TODO
+            //             VkPhysicalDeviceFeatures features;
+            //             vkGetPhysicalDeviceFeatures(physicalDevice, &features);
         }
 
         if (physicalDeviceProperties.vendorID == 0x13b5)
@@ -835,16 +836,16 @@ int create_gpu_instance()
         }
 
         TLOG_INFO("[%u %s]  queueC=%u[%u]  queueG=%u[%u]  queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName,
-                gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
-                gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count,
-                gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count);
+                  gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
+                  gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count,
+                  gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count);
 
         TLOG_INFO("[%u %s]  buglssc=%d  bugihfa=%d\n", i, physicalDeviceProperties.deviceName,
-                gpu_info.bug_local_size_spec_const, gpu_info.bug_implicit_fp16_arithmetic);
+                  gpu_info.bug_local_size_spec_const, gpu_info.bug_implicit_fp16_arithmetic);
 
         TLOG_INFO("[%u %s]  fp16p=%d  fp16s=%d  fp16a=%d  int8s=%d  int8a=%d\n", i, physicalDeviceProperties.deviceName,
-                gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
-                gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);
+                  gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
+                  gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);
 
         gpu_info_index++;
     }
@@ -856,7 +857,7 @@ int create_gpu_instance()
 
     // resolve shader info
     // TLOG_INFO("run create gpu instance resolve shader info, layer_shader_registry_entry_count:%d\n", layer_shader_registry_entry_count);
-    for (int i=0; i<layer_shader_registry_entry_count; i++)
+    for (int i = 0; i < layer_shader_registry_entry_count; i++)
     {
         resolve_shader_info(layer_shader_registry[i].spv_data, layer_shader_registry[i].spv_data_size, layer_shader_infos[i]);
     }
@@ -866,7 +867,7 @@ int create_gpu_instance()
 
 void destroy_gpu_instance()
 {
-    for (int i=0; i<MAX_GPU_COUNT; i++)
+    for (int i = 0; i < MAX_GPU_COUNT; i++)
     {
         delete g_default_vkdev[i];
         g_default_vkdev[i] = 0;
@@ -897,7 +898,8 @@ const GpuInfo& get_gpu_info(int device_index)
     return g_gpu_infos[device_index];
 }
 
-GPUDevice::GPUDevice(int device_index) : info(g_gpu_infos[device_index])
+GPUDevice::GPUDevice(int device_index)
+    : info(g_gpu_infos[device_index])
 {
     std::vector<const char*> enabledExtensions;
     if (info.support_VK_KHR_8bit_storage)
@@ -986,9 +988,9 @@ GPUDevice::GPUDevice(int device_index) : info(g_gpu_infos[device_index])
         querySamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures;
         enabledExtensionFeatures = &querySamplerYcbcrConversionFeatures;
     }
-    std::vector<float> compute_queue_priorities(info.compute_queue_count, 1.f);// 0.f ~ 1.f
-    std::vector<float> graphics_queue_priorities(info.graphics_queue_count, 1.f);// 0.f ~ 1.f
-    std::vector<float> transfer_queue_priorities(info.transfer_queue_count, 1.f);// 0.f ~ 1.f
+    std::vector<float> compute_queue_priorities(info.compute_queue_count, 1.f);   // 0.f ~ 1.f
+    std::vector<float> graphics_queue_priorities(info.graphics_queue_count, 1.f); // 0.f ~ 1.f
+    std::vector<float> transfer_queue_priorities(info.transfer_queue_count, 1.f); // 0.f ~ 1.f
 
     VkDeviceQueueCreateInfo deviceQueueCreateInfos[3];
     VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo;
@@ -1048,7 +1050,7 @@ GPUDevice::GPUDevice(int device_index) : info(g_gpu_infos[device_index])
     deviceCreateInfo.ppEnabledLayerNames = 0;
     deviceCreateInfo.enabledExtensionCount = enabledExtensions.size();
     deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
-    deviceCreateInfo.pEnabledFeatures = 0;// VkPhysicalDeviceFeatures pointer
+    deviceCreateInfo.pEnabledFeatures = 0; // VkPhysicalDeviceFeatures pointer
 
     VkResult ret = vkCreateDevice(info.physical_device, &deviceCreateInfo, 0, &device);
     if (ret != VK_SUCCESS)
@@ -1066,7 +1068,7 @@ GPUDevice::GPUDevice(int device_index) : info(g_gpu_infos[device_index])
     for (uint32_t i = 0; i < info.compute_queue_count; i++)
     {
         vkGetDeviceQueue(device, info.compute_queue_family_index, i, &compute_queues[i]);
-	
+
         blob_allocators[i] = new VkBlobAllocator(this);
         staging_allocators[i] = new VkStagingAllocator(this);
     }
@@ -1265,13 +1267,10 @@ VkShaderModule GPUDevice::compile_shader_module(const uint32_t* spv_data, size_t
     return shader_module;
 }
 
-
-
-
 uint32_t GPUDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
 {
     // first try, find required and with preferred and without preferred_not
-    for (uint32_t i=0; i<info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
+    for (uint32_t i = 0; i < info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
     {
         bool is_required = (1 << i) & memory_type_bits;
         if (is_required)
@@ -1281,13 +1280,13 @@ uint32_t GPUDevice::find_memory_index(uint32_t memory_type_bits, VkFlags require
                 && (preferred && (memoryType.propertyFlags & preferred))
                 && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
             {
-										                    return i;
-										                }
+                return i;
+            }
         }
     }
 
     // second try, find required and with preferred
-    for (uint32_t i=0; i<info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
+    for (uint32_t i = 0; i < info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
     {
         bool is_required = (1 << i) & memory_type_bits;
         if (is_required)
@@ -1302,7 +1301,7 @@ uint32_t GPUDevice::find_memory_index(uint32_t memory_type_bits, VkFlags require
     }
 
     // third try, find required and without preferred_not
-    for (uint32_t i=0; i<info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
+    for (uint32_t i = 0; i < info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
     {
         bool is_required = (1 << i) & memory_type_bits;
         if (is_required)
@@ -1317,7 +1316,7 @@ uint32_t GPUDevice::find_memory_index(uint32_t memory_type_bits, VkFlags require
     }
 
     // fourth try, find any required
-    for (uint32_t i=0; i<info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
+    for (uint32_t i = 0; i < info.physicalDeviceMemoryProperties.memoryTypeCount; i++)
     {
         bool is_required = (1 << i) & memory_type_bits;
         if (is_required)
@@ -1360,9 +1359,10 @@ VkQueue GPUDevice::acquire_queue(uint32_t queue_family_index) const
 
     MutexLockGuard lock(queue_lock);
 
-    std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index ? compute_queues
-	        : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues;
-    for (int i=0; i<(int)queues.size(); i++)
+    std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index    ? compute_queues
+                                   : queue_family_index == info.graphics_queue_family_index ? graphics_queues
+                                                                                            : transfer_queues;
+    for (int i = 0; i < (int)queues.size(); i++)
     {
         VkQueue queue = queues[i];
         if (queue)
@@ -1390,9 +1390,10 @@ void GPUDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const
     // TODO
     MutexLockGuard lock(queue_lock);
 
-    std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index ? compute_queues 
-		    : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues;
-    for (int i=0; i<(int)queues.size(); i++)
+    std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index    ? compute_queues
+                                   : queue_family_index == info.graphics_queue_family_index ? graphics_queues
+                                                                                            : transfer_queues;
+    for (int i = 0; i < (int)queues.size(); i++)
     {
         if (!queues[i])
         {
@@ -1408,7 +1409,7 @@ VkAllocator* GPUDevice::acquire_blob_allocator() const
 {
     MutexLockGuard lock(blob_allocator_lock);
 
-    for (int i=0; i<(int)blob_allocators.size(); i++)
+    for (int i = 0; i < (int)blob_allocators.size(); i++)
     {
         VkAllocator* allocator = blob_allocators[i];
         if (allocator)
@@ -1426,7 +1427,7 @@ void GPUDevice::reclaim_blob_allocator(VkAllocator* allocator) const
 {
     MutexLockGuard lock(blob_allocator_lock);
 
-    for (int i=0; i<(int)blob_allocators.size(); i++)
+    for (int i = 0; i < (int)blob_allocators.size(); i++)
     {
         if (!blob_allocators[i])
         {
@@ -1438,12 +1439,11 @@ void GPUDevice::reclaim_blob_allocator(VkAllocator* allocator) const
     TLOG_INFO("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator);
 }
 
-
 VkAllocator* GPUDevice::acquire_staging_allocator() const
 {
     MutexLockGuard lock(staging_allocator_lock);
 
-    for (int i=0; i<(int)staging_allocators.size(); i++)
+    for (int i = 0; i < (int)staging_allocators.size(); i++)
     {
         VkAllocator* allocator = staging_allocators[i];
         if (allocator)
@@ -1457,12 +1457,11 @@ VkAllocator* GPUDevice::acquire_staging_allocator() const
     return 0;
 }
 
-
 void GPUDevice::reclaim_staging_allocator(VkAllocator* allocator) const
 {
     MutexLockGuard lock(staging_allocator_lock);
 
-    for (int i=0; i<(int)staging_allocators.size(); i++)
+    for (int i = 0; i < (int)staging_allocators.size(); i++)
     {
         if (!staging_allocators[i])
         {
@@ -1483,7 +1482,7 @@ int GPUDevice::create_shader_module()
     }
 
     shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE);
-    for (int i=0; i<layer_shader_registry_entry_count; i++)
+    for (int i = 0; i < layer_shader_registry_entry_count; i++)
     {
         // add_shader cmake macro
         // 0 = fp32
@@ -1491,14 +1490,14 @@ int GPUDevice::create_shader_module()
         // 2 = fp16pa
         // 3 = fp16s
         // 4 = fp16sa
-        
+
         if (!info.support_fp16_packed)
         {
             if (i % 5 == 1)
                 continue;
         }
 
-	    if (!info.support_fp16_packed || !info.support_fp16_arithmetic)
+        if (!info.support_fp16_packed || !info.support_fp16_arithmetic)
         {
             if (i % 5 == 2)
                 continue;
@@ -1532,7 +1531,7 @@ int GPUDevice::create_shader_module()
 
 void GPUDevice::destroy_shader_module()
 {
-    for (int i=0; i<(int)shader_modules.size(); i++)
+    for (int i = 0; i < (int)shader_modules.size(); i++)
     {
         vkDestroyShaderModule(device, shader_modules[i], 0);
     }
@@ -1754,9 +1753,9 @@ int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderIn
     shader_info.push_constant_count = push_constant_count;
 
     // resolve binding_types
-    for (int i=0; i<binding_count; i++)
+    for (int i = 0; i < binding_count; i++)
     {
-        shader_info.binding_types[i] = id_types[ binding_types[i] ];
+        shader_info.binding_types[i] = id_types[binding_types[i]];
     }
 
     return 0;
@@ -1775,7 +1774,8 @@ VkImageTensor GPUDevice::get_dummy_image() const
 class VkDummyAllocator : public VkBlobAllocator
 {
 public:
-    VkDummyAllocator(const GPUDevice* _vkdev) : VkBlobAllocator(_vkdev)
+    VkDummyAllocator(const GPUDevice* _vkdev)
+        : VkBlobAllocator(_vkdev)
     {
         // NOTE 16k is large enough I think ...
         block_size = alignSize(16 * 1024, buffer_offset_alignment);
@@ -1785,11 +1785,14 @@ class VkDummyAllocator : public VkBlobAllocator
 class VkDummyCompute : public VkCompute
 {
 public:
-    VkDummyCompute(const GPUDevice* _vkdev) : VkCompute(_vkdev) {}
+    VkDummyCompute(const GPUDevice* _vkdev)
+        : VkCompute(_vkdev)
+    {
+    }
 
     void record_dummy(const VkTensor& buffer)
     {
-//         TLOG_INFO("xxx barrier buffer %p +%d ~%d", buffer.buffer(), buffer.buffer_offset(), buffer.buffer_capacity());
+        //         TLOG_INFO("xxx barrier buffer %p +%d ~%d", buffer.buffer(), buffer.buffer_offset(), buffer.buffer_capacity());
 
         // barrier device any @ compute/null to shader-readwrite @ compute
         VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
@@ -1830,7 +1833,7 @@ class VkDummyCompute : public VkCompute
 
     void record_dummy(const VkImageTensor& image)
     {
-//         TLOG_INFO("xxx barrier image %p +%d ~%d %p", image.image(), image.data->bind_offset, image.data->bind_capacity, image.imageview());
+        //         TLOG_INFO("xxx barrier image %p +%d ~%d %p", image.image(), image.data->bind_offset, image.data->bind_capacity, image.imageview());
 
         // image layout transform any @ any to shader-write @ compute
         VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
@@ -1874,7 +1877,6 @@ class VkDummyCompute : public VkCompute
         image.data->image_layout = VK_IMAGE_LAYOUT_GENERAL;
         image.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
     }
-
 };
 
 int GPUDevice::create_dummy_buffer_image()
@@ -1911,56 +1913,57 @@ int GPUDevice::create_utility_operator()
 
     // from buffer | image
     // to buffer | image
-    for (int i0=0; i0<2; i0++)
-    {
-    for (int i1=0; i1<2; i1++)
+    for (int i0 = 0; i0 < 2; i0++)
     {
-        opt.use_image_storage = (i0 == 1 || i1 == 1);
-// #if __APPLE__
-//         if (opt.use_image_storage)
-//             continue;
-// #endif
-
-        // from fp32-b/i | fp16p-b/i | fp16s-b/i
-        // to fp32-b/i | fp16p-b/i | fp16s-b/i
-        for (int j0=0; j0<3; j0++)
-        {
-        for (int j1=0; j1<3; j1++)
+        for (int i1 = 0; i1 < 2; i1++)
         {
-            opt.use_fp16_packed = (j0 == 1 || j1 == 1);
-            opt.use_fp16_storage = (j0 == 2 || j1 == 2);
-
-            if (!info.support_fp16_packed && opt.use_fp16_packed)
-                continue;
-
-            if (!info.support_fp16_storage && opt.use_fp16_storage)
-                continue;
-
-            // from pack1 | pack4 | pack8
-            for (int k=0; k<3; k++)
+            opt.use_image_storage = (i0 == 1 || i1 == 1);
+            // #if __APPLE__
+            //         if (opt.use_image_storage)
+            //             continue;
+            // #endif
+
+            // from fp32-b/i | fp16p-b/i | fp16s-b/i
+            // to fp32-b/i | fp16p-b/i | fp16s-b/i
+            for (int j0 = 0; j0 < 3; j0++)
             {
-                // enable pack8 for pack8to1/pack8to4
-                opt.use_shader_pack8 = true;
-
-                {   // create packing layer
-                    TEngine::Packing_vulkan* uop = new Packing_vulkan();
-                    uop->vkdev = this;
-
-                    uop->out_elempack = k == 0 ? 1 : k == 1 ? 4 : 8;
-                    uop->cast_type_from = j0 + 1;
-                    uop->cast_type_to = j1 + 1;
-                    uop->storage_type_from = i0;
-                    uop->storage_type_to = i1;
-                    // TLOG_INFO("out_elempack:%d %d %d %d %d\n", uop->out_elempack, uop->cast_type_from, uop->cast_type_to, uop->storage_type_from, uop->storage_type_to);
-
-                    uop->create_pipeline(opt);
-
-                    uop_packing[i0][i1][j0][j1][k] = uop;
+                for (int j1 = 0; j1 < 3; j1++)
+                {
+                    opt.use_fp16_packed = (j0 == 1 || j1 == 1);
+                    opt.use_fp16_storage = (j0 == 2 || j1 == 2);
+
+                    if (!info.support_fp16_packed && opt.use_fp16_packed)
+                        continue;
+
+                    if (!info.support_fp16_storage && opt.use_fp16_storage)
+                        continue;
+
+                    // from pack1 | pack4 | pack8
+                    for (int k = 0; k < 3; k++)
+                    {
+                        // enable pack8 for pack8to1/pack8to4
+                        opt.use_shader_pack8 = true;
+
+                        { // create packing layer
+                            TEngine::Packing_vulkan* uop = new Packing_vulkan();
+                            uop->vkdev = this;
+
+                            uop->out_elempack = k == 0 ? 1 : k == 1 ? 4
+                                                                    : 8;
+                            uop->cast_type_from = j0 + 1;
+                            uop->cast_type_to = j1 + 1;
+                            uop->storage_type_from = i0;
+                            uop->storage_type_to = i1;
+                            // TLOG_INFO("out_elempack:%d %d %d %d %d\n", uop->out_elempack, uop->cast_type_from, uop->cast_type_to, uop->storage_type_from, uop->storage_type_to);
+
+                            uop->create_pipeline(opt);
+
+                            uop_packing[i0][i1][j0][j1][k] = uop;
+                        }
+                    }
                 }
             }
         }
-        }
-    }
     }
 
     return 0;
@@ -1972,47 +1975,47 @@ void GPUDevice::destroy_utility_operator()
 
     // from buffer | image
     // to buffer | image
-    for (int i0=0; i0<2; i0++)
+    for (int i0 = 0; i0 < 2; i0++)
     {
-    for (int i1=0; i1<2; i1++)
-    {
-        opt.use_image_storage = (i0 == 1 || i1 == 1);
+        for (int i1 = 0; i1 < 2; i1++)
+        {
+            opt.use_image_storage = (i0 == 1 || i1 == 1);
 #if __APPLE__
-        if (opt.use_image_storage)
-            continue;
+            if (opt.use_image_storage)
+                continue;
 #endif
 
-        // from fp32-b/i | fp16p-b/i | fp16s-b/i
-        // to fp32-b/i | fp16p-b/i | fp16s-b/i
-        for (int j0=0; j0<3; j0++)
-        {
-        for (int j1=0; j1<3; j1++)
-        {
-            opt.use_fp16_packed = (j0 == 1 || j1 == 1);
-            opt.use_fp16_storage = (j0 == 2 || j1 == 2);
+            // from fp32-b/i | fp16p-b/i | fp16s-b/i
+            // to fp32-b/i | fp16p-b/i | fp16s-b/i
+            for (int j0 = 0; j0 < 3; j0++)
+            {
+                for (int j1 = 0; j1 < 3; j1++)
+                {
+                    opt.use_fp16_packed = (j0 == 1 || j1 == 1);
+                    opt.use_fp16_storage = (j0 == 2 || j1 == 2);
 
-            if (!info.support_fp16_packed && opt.use_fp16_packed)
-                continue;
+                    if (!info.support_fp16_packed && opt.use_fp16_packed)
+                        continue;
 
-            if (!info.support_fp16_storage && opt.use_fp16_storage)
-                continue;
+                    if (!info.support_fp16_storage && opt.use_fp16_storage)
+                        continue;
 
-            // from pack1 | pack4 | pack8
-            for (int k=0; k<3; k++)
-            {
-                opt.use_shader_pack8 = (k == 2 || k == 2);
+                    // from pack1 | pack4 | pack8
+                    for (int k = 0; k < 3; k++)
+                    {
+                        opt.use_shader_pack8 = (k == 2 || k == 2);
 
-                TEngine::Layer* uop = uop_packing[i0][i1][j0][j1][k];
+                        TEngine::Layer* uop = uop_packing[i0][i1][j0][j1][k];
 
-                uop->destroy_pipeline(opt);
+                        uop->destroy_pipeline(opt);
 
-                delete uop;
+                        delete uop;
 
-                uop_packing[i0][i1][j0][j1][k] = 0;
+                        uop_packing[i0][i1][j0][j1][k] = 0;
+                    }
+                }
             }
         }
-        }
-    }
     }
 }
 
@@ -2022,9 +2025,12 @@ void GPUDevice::convert_packing(const VkTensor& src, VkTensor& dst, int dst_elem
     Option opt = _opt;
     opt.use_image_storage = false;
 
-    int cast_type_from_index = src.elemsize == src.elempack * 4u ? 0 : opt.use_fp16_storage ? 2 : 1;
-    int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed && dst_elempack % 4 == 0 ? 1 : 0;
-    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
+    int cast_type_from_index = src.elemsize == src.elempack * 4u ? 0 : opt.use_fp16_storage ? 2
+                                                                                            : 1;
+    int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed && dst_elempack % 4 == 0 ? 1
+                                                                                                     : 0;
+    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1
+                                                                          : 2;
 
     // TLOG_INFO("convert_packing b2b %d %d %d\n", cast_type_from_index, cast_type_to_index, packing_type_to_index);
 
diff --git a/source/device/vulkan/vulkan_gpu.hpp b/source/device/vulkan/vulkan_gpu.hpp
index b0a6466a1..b5cce6eac 100644
--- a/source/device/vulkan/vulkan_gpu.hpp
+++ b/source/device/vulkan/vulkan_gpu.hpp
@@ -196,7 +196,10 @@ class GPUDevice
 
     const GpuInfo& info;
 
-    VkDevice vkdevice() const { return device; }
+    VkDevice vkdevice() const
+    {
+        return device;
+    }
 
     VkShaderModule get_shader_module(int shader_type_index) const;
 
@@ -294,17 +297,17 @@ class GPUDevice
     mutable std::vector<VkQueue> compute_queues;
     mutable std::vector<VkQueue> graphics_queues;
     mutable std::vector<VkQueue> transfer_queues;
-    
+
     mutable Mutex queue_lock;
 
     // default blob allocator for each queue
     mutable std::vector<VkAllocator*> blob_allocators;
-    
+
     mutable Mutex blob_allocator_lock;
 
     // default staging allocator for each queue
     mutable std::vector<VkAllocator*> staging_allocators;
-    
+
     mutable Mutex staging_allocator_lock;
 
     // dummy buffer and image
@@ -335,15 +338,24 @@ class ShaderInfo
     // 1 = storage buffer
     // 2 = storage image
     // 3 = combined image sampler
-    int binding_types[16];// 16 is large enough(maybe)
+    int binding_types[16]; // 16 is large enough(maybe)
 };
 
 const ShaderInfo& get_shader_info(int shader_type_index);
 int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
 
-union vk_specialization_type { int i; float f; uint32_t u32; };
-union vk_constant_type { int i; float f; };
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type
+{
+    int i;
+    float f;
+};
 
-}
+} // namespace TEngine
 
 #endif // VULKAN_GPU_HPP
diff --git a/source/device/vulkan/vulkan_graph.hpp b/source/device/vulkan/vulkan_graph.hpp
index 8218f271c..700f95103 100644
--- a/source/device/vulkan/vulkan_graph.hpp
+++ b/source/device/vulkan/vulkan_graph.hpp
@@ -38,8 +38,7 @@
 #include "vulkan_option.hpp"
 #include "vulkan_layer.hpp"
 
-extern "C"
-{
+extern "C" {
 // #include "device/device.h"
 // #include "graph/subgraph.h"
 
@@ -55,19 +54,21 @@ extern "C"
 #include "utility/vector.h"
 #include "utility/log.h"
 
-
 #include "convolution_param.h"
 
 namespace TEngine {
 
 class VulkanDevice;
 
-class VulkanGraph {
-
-friend VulkanDevice;
+class VulkanGraph
+{
+    friend VulkanDevice;
 
 public:
-    const std::string& GetName(void) const {return name_;}
+    const std::string& GetName(void) const
+    {
+        return name_;
+    }
 
     VulkanGraph(const std::string& name);
     VulkanGraph(struct subgraph* graph);
@@ -82,14 +83,14 @@ friend VulkanDevice;
     bool CreatePoolingPipeline(ir_node_t* node);
 
     std::unordered_map<std::string, tensor*> tensor_map_;    // tengine lite cpu tensor list
-    std::unordered_map<std::string, Tensor> tensor_map;         // vulkan cpu tensor list
-    std::unordered_map<std::string, VkTensor> vktensor_map_;    // vulkan gpu tensor list
+    std::unordered_map<std::string, Tensor> tensor_map;      // vulkan cpu tensor list
+    std::unordered_map<std::string, VkTensor> vktensor_map_; // vulkan gpu tensor list
 
     bool OpSupported(const std::string& name);
 
     Option opt;
     Pipeline* pipeline_convolution;
-    
+
     int record_graph_pipeline();
 
     int upload_model();
@@ -106,23 +107,21 @@ friend VulkanDevice;
 
     VkAllocator* weight_vkallocator;
     VkAllocator* weight_staging_vkallocator;
-    
-private:
 
+private:
     VkAllocator* local_blob_vkallocator;
     VkAllocator* local_staging_vkallocator;
-    
+
     std::string name_;
 
-    std::vector<void *> gpu_mem_vector_;
-    std::vector<void *> mem_buf_vector_;
+    std::vector<void*> gpu_mem_vector_;
+    std::vector<void*> mem_buf_vector_;
 
     std::map<std::string, tensor*> iotensor_map_;
 };
 
 } //namespace TEngine
 
-
 int vulkan_dev_init(struct device* dev);
 int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options);
 int vulkan_dev_run(struct device* dev, struct subgraph* subgraph);
@@ -130,7 +129,6 @@ int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph);
 int vulkan_dev_release(struct device* dev);
 }
 
-
 /*
 
 
diff --git a/source/device/vulkan/vulkan_helper.hpp b/source/device/vulkan/vulkan_helper.hpp
index 3955be7bb..a273a3b25 100644
--- a/source/device/vulkan/vulkan_helper.hpp
+++ b/source/device/vulkan/vulkan_helper.hpp
@@ -32,8 +32,7 @@
 #include <string>
 #include <fstream>
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "graph/tensor.h"
 #include "graph/node.h"
@@ -49,7 +48,7 @@ extern "C"
 // bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status);
 
 /** convert the kernel file into a string */
-int convertToString(const char *filename, std::string& s);
+int convertToString(const char* filename, std::string& s);
 
 /**Getting platforms and choose an available one.*/
 // int getPlatform(cl_platform_id &platform);
@@ -60,4 +59,3 @@ int convertToString(const char *filename, std::string& s);
 void get_device_message();
 
 void dump_sub_graph(struct subgraph* sub_graph);
-
diff --git a/source/device/vulkan/vulkan_layer.cpp b/source/device/vulkan/vulkan_layer.cpp
index a4c7e4dab..84f2b9de2 100644
--- a/source/device/vulkan/vulkan_layer.cpp
+++ b/source/device/vulkan/vulkan_layer.cpp
@@ -81,4 +81,4 @@ int Layer::record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vecto
     return 0;
 }
 
-} // TEngine
\ No newline at end of file
+} // namespace TEngine
\ No newline at end of file
diff --git a/source/device/vulkan/vulkan_layer.hpp b/source/device/vulkan/vulkan_layer.hpp
index 526ca148b..2c2be9710 100644
--- a/source/device/vulkan/vulkan_layer.hpp
+++ b/source/device/vulkan/vulkan_layer.hpp
@@ -44,8 +44,7 @@
 #include "vulkan_command.hpp"
 #include "vulkan_pipeline.hpp"
 
-extern "C"
-{
+extern "C" {
 #include "api/c_api.h"
 #include "device/device.h"
 #include "graph/tensor.h"
@@ -114,6 +113,6 @@ class Layer
 
 Layer* create_layer(std::string type);
 
-} // TEngine
+} // namespace TEngine
 
 #endif // VULKAN_LAYER_HPP
diff --git a/source/device/vulkan/vulkan_limit.hpp b/source/device/vulkan/vulkan_limit.hpp
index 741786fae..fbb45e089 100644
--- a/source/device/vulkan/vulkan_limit.hpp
+++ b/source/device/vulkan/vulkan_limit.hpp
@@ -22,139 +22,134 @@
  * Author: hhchen@openailab.com
  */
 
-
 #pragma once
 
-extern "C"
-{
+extern "C" {
 #include "operator/op.h"
 }
 
-
 const int vulkan_supported_ops[] = {
 
-        OP_CLIP,
-        OP_CONCAT,
-        OP_CONST,
-        OP_CONV,
-        OP_DROPOUT,
-        OP_ELTWISE,
-        OP_FC,
-        OP_FLATTEN,
-        OP_INPUT,
-////        OP_PERMUTE,
-        OP_POOL,
-        OP_RELU,
-        OP_RESHAPE,
-        OP_SLICE,
-////        OP_SOFTMAX
-
-
-//        OP_BIAS,
+    OP_CLIP,
+    OP_CONCAT,
+    OP_CONST,
+    OP_CONV,
+    OP_DROPOUT,
+    OP_ELTWISE,
+    OP_FC,
+    OP_FLATTEN,
+    OP_INPUT,
+    ////        OP_PERMUTE,
+    OP_POOL,
+    OP_RELU,
+    OP_RESHAPE,
+    OP_SLICE,
+    ////        OP_SOFTMAX
 
-////        OP_ABSVAL,
-////        OP_ADD_N,
-////        OP_ARGMAX,
-////        OP_ARGMIN,
-////        OP_BATCHNORM,
-////        OP_BATCHTOSPACEND,
-////        OP_BIAS,
-////        OP_BROADMUL,
-//
-////        OP_CAST,
-////        OP_CEIL,
-////        OP_CLIP,
-////        OP_COMPARISON,
-////        OP_CONCAT,
-//        OP_CONST,
-//        OP_CONV,
-////        OP_CROP,
-////        OP_DECONV,
-////        OP_DEPTHTOSPACE,
-////        OP_DETECTION_OUTPUT,
-////        OP_DETECTION_POSTPROCESS,
-//
-////        OP_DROPOUT,
-////        OP_ELTWISE,
-////        OP_ELU,
-////        OP_EMBEDDING,
-////        OP_EXPANDDIMS,
-////        OP_FC,
-////        OP_FLATTEN,
-////        OP_GATHER,
-////        OP_GEMM,
-////        OP_GRU,
-////        OP_HARDSIGMOID,
-////        OP_HARDSWISH,
-//        OP_INPUT,
-////        OP_INSTANCENORM,
-////        OP_INTERP,
-////        OP_LOGICAL,
-////        OP_LOGISTIC,
-////        OP_LRN,
-////        OP_LSTM,
-////        OP_MATMUL,
-////        OP_MAXIMUM,
-////        OP_MEAN,
-////        OP_MINIMUM,
-////        OP_MVN,
-////        OP_NOOP,
-////        OP_NORMALIZE,
-//
-////        OP_PAD,
-////        OP_PERMUTE,
-//        OP_POOL,
-////        OP_PRELU,
-////        OP_PRIORBOX,
-////        OP_PSROIPOOLING,
-////        OP_REDUCEL2,
-////        OP_REDUCTION,
-////        OP_REGION,
-//        OP_RELU,
-//
-////        OP_RELU6,
-////        OP_REORG,
-////        OP_RESHAPE,
-////        OP_RESIZE,
-////        OP_REVERSE,
-////        OP_RNN,
-////        OP_ROIALIGN,
-////        OP_ROIPOOLING,
-////        OP_ROUND,
-////        OP_RPN,
-////        OP_SCALE,
-////        OP_SELU,
-////        OP_SHUFFLECHANNEL,
-////        OP_SIGMOID,
-//
-////        OP_SLICE,
-////        OP_SOFTMAX,
-////        OP_SPACETOBATCHND,
-////        OP_SPACETODEPTH,
-////        OP_SPARSETODENSE,
-////        OP_SPLIT,
-////        OP_SQUAREDDIFFERENCE,
-////        OP_SQUEEZE,
-////        OP_STRIDED_SLICE,
-////        OP_SWAP_AXIS,
-////        OP_TANH,
-////        OP_THRESHOLD,
-////        OP_TOPKV2,
-////        OP_TRANSPOSE,
-////        OP_UNARY,
-////        OP_UNSQUEEZE,
-////        OP_UPSAMPLE,
-////        OP_ZEROSLIKE,
-////        OP_MISH,
-////        OP_LOGSOFTMAX,
-////        OP_RELU1,
-////        OP_L2NORMALIZATION,
-////        OP_L2POOL,
-////        OP_TILE,
-////        OP_SHAPE,
-////        OP_SCATTER,
-////        OP_WHERE,
-////        OP_BUILTIN_LAST
+    //        OP_BIAS,
 
+    ////        OP_ABSVAL,
+    ////        OP_ADD_N,
+    ////        OP_ARGMAX,
+    ////        OP_ARGMIN,
+    ////        OP_BATCHNORM,
+    ////        OP_BATCHTOSPACEND,
+    ////        OP_BIAS,
+    ////        OP_BROADMUL,
+    //
+    ////        OP_CAST,
+    ////        OP_CEIL,
+    ////        OP_CLIP,
+    ////        OP_COMPARISON,
+    ////        OP_CONCAT,
+    //        OP_CONST,
+    //        OP_CONV,
+    ////        OP_CROP,
+    ////        OP_DECONV,
+    ////        OP_DEPTHTOSPACE,
+    ////        OP_DETECTION_OUTPUT,
+    ////        OP_DETECTION_POSTPROCESS,
+    //
+    ////        OP_DROPOUT,
+    ////        OP_ELTWISE,
+    ////        OP_ELU,
+    ////        OP_EMBEDDING,
+    ////        OP_EXPANDDIMS,
+    ////        OP_FC,
+    ////        OP_FLATTEN,
+    ////        OP_GATHER,
+    ////        OP_GEMM,
+    ////        OP_GRU,
+    ////        OP_HARDSIGMOID,
+    ////        OP_HARDSWISH,
+    //        OP_INPUT,
+    ////        OP_INSTANCENORM,
+    ////        OP_INTERP,
+    ////        OP_LOGICAL,
+    ////        OP_LOGISTIC,
+    ////        OP_LRN,
+    ////        OP_LSTM,
+    ////        OP_MATMUL,
+    ////        OP_MAXIMUM,
+    ////        OP_MEAN,
+    ////        OP_MINIMUM,
+    ////        OP_MVN,
+    ////        OP_NOOP,
+    ////        OP_NORMALIZE,
+    //
+    ////        OP_PAD,
+    ////        OP_PERMUTE,
+    //        OP_POOL,
+    ////        OP_PRELU,
+    ////        OP_PRIORBOX,
+    ////        OP_PSROIPOOLING,
+    ////        OP_REDUCEL2,
+    ////        OP_REDUCTION,
+    ////        OP_REGION,
+    //        OP_RELU,
+    //
+    ////        OP_RELU6,
+    ////        OP_REORG,
+    ////        OP_RESHAPE,
+    ////        OP_RESIZE,
+    ////        OP_REVERSE,
+    ////        OP_RNN,
+    ////        OP_ROIALIGN,
+    ////        OP_ROIPOOLING,
+    ////        OP_ROUND,
+    ////        OP_RPN,
+    ////        OP_SCALE,
+    ////        OP_SELU,
+    ////        OP_SHUFFLECHANNEL,
+    ////        OP_SIGMOID,
+    //
+    ////        OP_SLICE,
+    ////        OP_SOFTMAX,
+    ////        OP_SPACETOBATCHND,
+    ////        OP_SPACETODEPTH,
+    ////        OP_SPARSETODENSE,
+    ////        OP_SPLIT,
+    ////        OP_SQUAREDDIFFERENCE,
+    ////        OP_SQUEEZE,
+    ////        OP_STRIDED_SLICE,
+    ////        OP_SWAP_AXIS,
+    ////        OP_TANH,
+    ////        OP_THRESHOLD,
+    ////        OP_TOPKV2,
+    ////        OP_TRANSPOSE,
+    ////        OP_UNARY,
+    ////        OP_UNSQUEEZE,
+    ////        OP_UPSAMPLE,
+    ////        OP_ZEROSLIKE,
+    ////        OP_MISH,
+    ////        OP_LOGSOFTMAX,
+    ////        OP_RELU1,
+    ////        OP_L2NORMALIZATION,
+    ////        OP_L2POOL,
+    ////        OP_TILE,
+    ////        OP_SHAPE,
+    ////        OP_SCATTER,
+    ////        OP_WHERE,
+    ////        OP_BUILTIN_LAST
 
 };
diff --git a/source/device/vulkan/vulkan_option.cpp b/source/device/vulkan/vulkan_option.cpp
index d57440411..e61d37a13 100644
--- a/source/device/vulkan/vulkan_option.cpp
+++ b/source/device/vulkan/vulkan_option.cpp
@@ -58,7 +58,7 @@ Option::Option()
     use_int8_inference = true;
     use_vulkan_compute = true;
 
-    use_fp16_packed = true; 
+    use_fp16_packed = true;
     use_fp16_storage = true;
     use_fp16_arithmetic = false;
     use_int8_storage = false;
diff --git a/source/device/vulkan/vulkan_pipeline.cpp b/source/device/vulkan/vulkan_pipeline.cpp
index 6935c76b5..d604db1f6 100644
--- a/source/device/vulkan/vulkan_pipeline.cpp
+++ b/source/device/vulkan/vulkan_pipeline.cpp
@@ -46,7 +46,8 @@
 
 namespace TEngine {
 
-Pipeline::Pipeline(const GPUDevice* _vkdev) : vkdev(_vkdev)
+Pipeline::Pipeline(const GPUDevice* _vkdev)
+    : vkdev(_vkdev)
 {
     local_shader_module = 0;
 
@@ -92,7 +93,7 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::
         local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
     }
 
-//     TLOG_INFO("local_shader_module %p created", local_shader_module);
+    //     TLOG_INFO("local_shader_module %p created", local_shader_module);
 
     return create(local_shader_module, si, specializations);
 }
@@ -198,12 +199,12 @@ void Pipeline::destroy()
         {
             vkdev->vkDestroyDescriptorUpdateTemplateKHR(vkdev->vkdevice(), descriptor_update_template, 0);
             descriptor_update_template = 0;
-        }	
+        }
     }
 
     if (pipeline)
     {
-	vkDestroyPipeline(vkdev->vkdevice(), pipeline, 0);
+        vkDestroyPipeline(vkdev->vkdevice(), pipeline, 0);
         pipeline = 0;
     }
 
@@ -307,7 +308,7 @@ void Pipeline::set_local_size_xyz(int w, int h, int c)
     local_size_y = h;
     local_size_z = c;
 
-//     TLOG_INFO("local size = %d %d %d", local_size_x, local_size_y, local_size_z);
+    //     TLOG_INFO("local size = %d %d %d", local_size_x, local_size_y, local_size_z);
 }
 
 int Pipeline::create_descriptorset_layout()
@@ -321,7 +322,7 @@ int Pipeline::create_descriptorset_layout()
     }
 
     std::vector<VkDescriptorSetLayoutBinding> descriptorSetLayoutBindings(binding_count);
-    for (int i=0; i<binding_count; i++)
+    for (int i = 0; i < binding_count; i++)
     {
         int binding_type = shader_info.binding_types[i];
 
@@ -388,24 +389,24 @@ int Pipeline::create_pipeline_layout()
 
     if (descriptorset_layout)
     {
-    pipelineLayoutCreateInfo.setLayoutCount = 1;
-    pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout;
+        pipelineLayoutCreateInfo.setLayoutCount = 1;
+        pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout;
     }
     else
     {
-    pipelineLayoutCreateInfo.setLayoutCount = 0;
-    pipelineLayoutCreateInfo.pSetLayouts = 0;
+        pipelineLayoutCreateInfo.setLayoutCount = 0;
+        pipelineLayoutCreateInfo.pSetLayouts = 0;
     }
 
     if (push_constant_count > 0)
     {
-    pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
-    pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
+        pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
+        pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
     }
     else
     {
-    pipelineLayoutCreateInfo.pushConstantRangeCount = 0;
-    pipelineLayoutCreateInfo.pPushConstantRanges = 0;
+        pipelineLayoutCreateInfo.pushConstantRangeCount = 0;
+        pipelineLayoutCreateInfo.pPushConstantRanges = 0;
     }
 
     VkResult ret = vkCreatePipelineLayout(vkdev->vkdevice(), &pipelineLayoutCreateInfo, 0, &pipeline_layout);
@@ -418,7 +419,6 @@ int Pipeline::create_pipeline_layout()
     return 0;
 }
 
-
 int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector<vk_specialization_type>& specializations)
 {
     const int specialization_count = specializations.size();
@@ -427,7 +427,7 @@ int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector<vk
     std::vector<VkSpecializationMapEntry> specializationMapEntries;
     specializationMapEntries.resize(specialization_count + 3);
 
-    for (int i=0; i<specialization_count; i++)
+    for (int i = 0; i < specialization_count; i++)
     {
         specializationMapEntries[i].constantID = i;
         specializationMapEntries[i].offset = i * sizeof(vk_specialization_type);
@@ -442,21 +442,21 @@ int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector<vk
         VkSpecializationMapEntry* local_size_xyz_entries = specializationMapEntries.data() + specialization_count;
 
         local_size_xyz_entries[0].constantID = 233;
-        local_size_xyz_entries[0].offset = (specialization_count+0) * sizeof(vk_specialization_type);
+        local_size_xyz_entries[0].offset = (specialization_count + 0) * sizeof(vk_specialization_type);
         local_size_xyz_entries[0].size = sizeof(vk_specialization_type);
 
         local_size_xyz_entries[1].constantID = 234;
-        local_size_xyz_entries[1].offset = (specialization_count+1) * sizeof(vk_specialization_type);
+        local_size_xyz_entries[1].offset = (specialization_count + 1) * sizeof(vk_specialization_type);
         local_size_xyz_entries[1].size = sizeof(vk_specialization_type);
 
         local_size_xyz_entries[2].constantID = 235;
-        local_size_xyz_entries[2].offset = (specialization_count+2) * sizeof(vk_specialization_type);
+        local_size_xyz_entries[2].offset = (specialization_count + 2) * sizeof(vk_specialization_type);
         local_size_xyz_entries[2].size = sizeof(vk_specialization_type);
 
         specialization_data.resize(specialization_count + 3);
-        specialization_data[ specialization_count+0 ].u32 = local_size_x;
-        specialization_data[ specialization_count+1 ].u32 = local_size_y;
-        specialization_data[ specialization_count+2 ].u32 = local_size_z;
+        specialization_data[specialization_count + 0].u32 = local_size_x;
+        specialization_data[specialization_count + 1].u32 = local_size_y;
+        specialization_data[specialization_count + 2].u32 = local_size_z;
     }
 
     VkSpecializationInfo specializationInfo;
@@ -505,7 +505,7 @@ int Pipeline::create_descriptor_update_template()
 
     std::vector<VkDescriptorUpdateTemplateEntryKHR> descriptorUpdateTemplateEntries(binding_count);
     size_t offset = 0;
-    for (int i=0; i<binding_count; i++)// TODO do not update weights
+    for (int i = 0; i < binding_count; i++) // TODO do not update weights
     {
         int binding_type = shader_info.binding_types[i];
 
@@ -537,15 +537,15 @@ int Pipeline::create_descriptor_update_template()
     descriptorUpdateTemplateCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR;
     descriptorUpdateTemplateCreateInfo.pNext = 0;
     descriptorUpdateTemplateCreateInfo.flags = 0;
-    descriptorUpdateTemplateCreateInfo.descriptorUpdateEntryCount = binding_count;// TODO do not update weights
+    descriptorUpdateTemplateCreateInfo.descriptorUpdateEntryCount = binding_count; // TODO do not update weights
     descriptorUpdateTemplateCreateInfo.pDescriptorUpdateEntries = descriptorUpdateTemplateEntries.data();
     if (vkdev->info.support_VK_KHR_push_descriptor)
     {
-    descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR;
+        descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR;
     }
     else
     {
-    descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR;
+        descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR;
     }
     // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR
     // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL  :(
@@ -564,5 +564,4 @@ int Pipeline::create_descriptor_update_template()
     return 0;
 }
 
-
 } // namespace TEngine
diff --git a/source/device/vulkan/vulkan_pipeline.hpp b/source/device/vulkan/vulkan_pipeline.hpp
index 9980d2e43..a2c349901 100644
--- a/source/device/vulkan/vulkan_pipeline.hpp
+++ b/source/device/vulkan/vulkan_pipeline.hpp
@@ -57,7 +57,7 @@ class Pipeline
 
 public:
     void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4);
-    
+
     void set_optimal_local_size_xyz(const VkTensor& local_size_xyz);
     void set_optimal_local_size_xyz(const Tensor& local_size_xyz);
     void set_local_size_xyz(int w, int h, int c);
diff --git a/source/device/vulkan/vulkan_platform.hpp b/source/device/vulkan/vulkan_platform.hpp
index cc03681a7..97f588246 100644
--- a/source/device/vulkan/vulkan_platform.hpp
+++ b/source/device/vulkan/vulkan_platform.hpp
@@ -47,10 +47,23 @@ namespace TEngine {
 class Mutex
 {
 public:
-    Mutex() { pthread_mutex_init(&mutex, 0); }
-    ~Mutex() { pthread_mutex_destroy(&mutex); }
-    void lock() { pthread_mutex_lock(&mutex); }
-    void unlock() { pthread_mutex_unlock(&mutex); }
+    Mutex()
+    {
+        pthread_mutex_init(&mutex, 0);
+    }
+    ~Mutex()
+    {
+        pthread_mutex_destroy(&mutex);
+    }
+    void lock()
+    {
+        pthread_mutex_lock(&mutex);
+    }
+    void unlock()
+    {
+        pthread_mutex_unlock(&mutex);
+    }
+
 private:
     friend class ConditionVariable;
     pthread_mutex_t mutex;
@@ -59,8 +72,16 @@ class Mutex
 class MutexLockGuard
 {
 public:
-    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); }
-    ~MutexLockGuard() { mutex.unlock(); }
+    MutexLockGuard(Mutex& _mutex)
+        : mutex(_mutex)
+    {
+        mutex.lock();
+    }
+    ~MutexLockGuard()
+    {
+        mutex.unlock();
+    }
+
 private:
     Mutex& mutex;
 };
@@ -68,11 +89,27 @@ class MutexLockGuard
 class ConditionVariable
 {
 public:
-    ConditionVariable() { pthread_cond_init(&cond, 0); }
-    ~ConditionVariable() { pthread_cond_destroy(&cond); }
-    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); }
-    void broadcast() { pthread_cond_broadcast(&cond); }
-    void signal() { pthread_cond_signal(&cond); }
+    ConditionVariable()
+    {
+        pthread_cond_init(&cond, 0);
+    }
+    ~ConditionVariable()
+    {
+        pthread_cond_destroy(&cond);
+    }
+    void wait(Mutex& mutex)
+    {
+        pthread_cond_wait(&cond, &mutex.mutex);
+    }
+    void broadcast()
+    {
+        pthread_cond_broadcast(&cond);
+    }
+    void signal()
+    {
+        pthread_cond_signal(&cond);
+    }
+
 private:
     pthread_cond_t cond;
 };
@@ -80,9 +117,18 @@ class ConditionVariable
 class Thread
 {
 public:
-    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); }
-    ~Thread() {}
-    void join() { pthread_join(t, 0); }
+    Thread(void* (*start)(void*), void* args = 0)
+    {
+        pthread_create(&t, 0, start, args);
+    }
+    ~Thread()
+    {
+    }
+    void join()
+    {
+        pthread_join(t, 0);
+    }
+
 private:
     pthread_t t;
 };
diff --git a/source/device/vulkan/vulkan_tensor.cpp b/source/device/vulkan/vulkan_tensor.cpp
index 38f588502..8beff0cc8 100644
--- a/source/device/vulkan/vulkan_tensor.cpp
+++ b/source/device/vulkan/vulkan_tensor.cpp
@@ -98,7 +98,7 @@ void convert_packing(const Tensor& src, Tensor& dst, int _elempack, const Option
         if (dst.empty())
             return;
 
-        #pragma omp parallel for
+#pragma omp parallel for
         for (int i = 0; i < outh; i++)
         {
             unsigned char* outptr = (unsigned char*)dst + i * w * out_elemsize;
@@ -135,7 +135,7 @@ void convert_packing(const Tensor& src, Tensor& dst, int _elempack, const Option
         if (dst.empty())
             return;
 
-        #pragma omp parallel for
+#pragma omp parallel for
         for (int q = 0; q < outc; q++)
         {
             Tensor out = dst.channel(q);
@@ -309,11 +309,11 @@ void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt)
         dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
     }
     if (dst.empty())
-        return ;
+        return;
 
     int size = w * h * elempack;
 
-    #pragma omp parallel for 
+#pragma omp parallel for
     for (int q = 0; q < channels; q++)
     {
         const float* ptr = src.channel(q);
@@ -324,7 +324,6 @@ void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt)
             outptr[i] = float32_to_float16(ptr[i]);
         }
     }
-
 }
 
 void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt)
@@ -353,11 +352,11 @@ void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt)
         dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
     }
     if (dst.empty())
-        return ;
+        return;
 
     int size = w * h * elempack;
 
-    #pragma omp parallel for
+#pragma omp parallel for
     for (int q = 0; q < channels; q++)
     {
         const unsigned short* ptr = src.channel(q);
@@ -368,7 +367,6 @@ void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt)
             outptr[i] = float16_to_float32(ptr[i]);
         }
     }
-
 }
 
-}   // namespace TEngine
+} // namespace TEngine
diff --git a/source/device/vulkan/vulkan_tensor.hpp b/source/device/vulkan/vulkan_tensor.hpp
index a0ef5a9bd..f10868c8c 100644
--- a/source/device/vulkan/vulkan_tensor.hpp
+++ b/source/device/vulkan/vulkan_tensor.hpp
@@ -44,8 +44,7 @@
 #include <cstring>
 // #include "tengine_ir.h"
 
-extern "C"
-{
+extern "C" {
 #include "graph/tensor.h"
 #include "graph/node.h"
 #include "graph/graph.h"
@@ -169,8 +168,10 @@ class Tensor
     const float* row(int y) const;
 
     // access raw data
-    template<typename T> operator T*();
-    template<typename T> operator const T*() const;
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
 
     // pointer to the data
     void* data;
@@ -205,8 +206,6 @@ class Tensor
     size_t cstep;
 };
 
-
-
 class VkTensor
 {
 public:
@@ -242,7 +241,7 @@ class VkTensor
     ~VkTensor();
     // assign
     VkTensor& operator=(const VkTensor& m);
-        // reshape vec
+    // reshape vec
     VkTensor reshape(int w, Allocator* allocator = 0) const;
     // reshape image
     VkTensor reshape(int w, int h, Allocator* allocator = 0) const;
@@ -290,7 +289,7 @@ class VkTensor
 
     // shape only
     // Mat shape() const;
-    
+
     // low-level reference
     VkBuffer buffer() const;
     size_t buffer_offset() const;
@@ -388,7 +387,6 @@ class VkImageTensor
     // allocate like
     void create_like(const VkImageTensor& im, VkAllocator* allocator);
 
-
     // mapped
     ///Mat mapped() const;
     void* mapped_ptr() const;
@@ -418,7 +416,7 @@ class VkImageTensor
 
     // pointer to the reference counter
     // when points to user-allocated data, the pointer is NULL
-    
+
     int* refcount;
 
     // element size in bytes
@@ -1139,7 +1137,6 @@ inline void VkImageTensor::create_like(const tensor* m, VkAllocator* _allocator)
         create(_w, _h, _c, _elemsize, _elempack, _allocator);
 }
 
-
 inline void VkImageTensor::create_like(const VkTensor& m, VkAllocator* _allocator)
 {
     int _dims = m.dims;
@@ -1248,23 +1245,25 @@ inline VkImageView VkImageTensor::imageview() const
     return data->imageview;
 }
 
-
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //Tensor defination
 
 inline Tensor::Tensor()
     : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
-}   
+}
 
 inline Tensor::Tensor(int _w, size_t _elemsize, Allocator* _allocator)
     : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
     create(_w, _elemsize, _allocator);
-}   
+}
 
-inline Tensor::Tensor(int _w, int _h, size_t _elemsize, Allocator* _allocator)     : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0){
-    create(_w, _h, _elemsize, _allocator);}
+inline Tensor::Tensor(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
 inline Tensor::Tensor(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
     : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
@@ -1299,7 +1298,7 @@ inline Tensor::Tensor(const Tensor& m)
 inline Tensor::Tensor(struct tensor* m)
     : data(m->data), refcount(0), elemsize(0), elempack(1), allocator(0), dims(0), w(0), h(0), c(0)
 {
-    if(m->layout == 0)
+    if (m->layout == 0)
     {
         c = m->dims[1];
         h = m->dims[2];
@@ -1398,7 +1397,7 @@ inline Tensor Tensor::reshape(int _w, Allocator* _allocator) const
         m.create(_w, elemsize, elempack, _allocator);
 
         // flatten
-        for (int i=0; i<c; i++)
+        for (int i = 0; i < c; i++)
         {
             const void* ptr = (unsigned char*)data + i * cstep * elemsize;
             void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
@@ -1431,7 +1430,7 @@ inline Tensor Tensor::reshape(int _w, int _h, Allocator* _allocator) const
         m.create(_w, _h, elemsize, elempack, _allocator);
 
         // flatten
-        for (int i=0; i<c; i++)
+        for (int i = 0; i < c; i++)
         {
             const void* ptr = (unsigned char*)data + i * cstep * elemsize;
             void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
@@ -1466,7 +1465,7 @@ inline Tensor Tensor::reshape(int _w, int _h, int _c, Allocator* _allocator) con
             m.create(_w, _h, _c, elemsize, elempack, _allocator);
 
             // align channel
-            for (int i=0; i<_c; i++)
+            for (int i = 0; i < _c; i++)
             {
                 const void* ptr = (unsigned char*)data + i * _w * _h * elemsize;
                 void* mptr = (unsigned char*)m.data + i * m.cstep * m.elemsize;
@@ -1661,7 +1660,7 @@ inline void Tensor::create(int _w, int _h, int _c, size_t _elemsize, int _elempa
     h = _h;
     c = _c;
 
-    cstep = w * h;    //alignSize(w * h * elemsize, 16) / elemsize;
+    cstep = w * h; //alignSize(w * h * elemsize, 16) / elemsize;
 
     if (total() > 0)
     {
@@ -1792,26 +1791,23 @@ inline const float* Tensor::row(int y) const
     return (const float*)((unsigned char*)data + w * y * elemsize);
 }
 
-template <typename T>
+template<typename T>
 inline Tensor::operator T*()
 {
     return (T*)data;
 }
 
-template <typename T>
+template<typename T>
 inline Tensor::operator const T*() const
 {
     return (const T*)data;
 }
 
 void convert_packing(const Tensor& src, Tensor& dst, int elempack, const Option& opt = Option());
-void convert_packing(tensor* src, Tensor&dst, int elempack, const Option& opt = Option());
+void convert_packing(tensor* src, Tensor& dst, int elempack, const Option& opt = Option());
 void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt = Option());
 void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt = Option());
 
-
 } // namespace TEngine
 
-
 #endif // VULKAN_TENSOR_HPP
-
diff --git a/source/executer/executer.c b/source/executer/executer.c
index 28fb4e513..7350cc4c7 100644
--- a/source/executer/executer.c
+++ b/source/executer/executer.c
@@ -30,19 +30,17 @@
 
 #include <string.h>
 
-
 void init_attribute(ir_attribute_t* attribute, ir_context_t* context)
 {
-    attribute->status               = GRAPH_STAT_CREATED;
-    attribute->priority             = 0;
-    attribute->policy               = DEFAULT_POLICY;
-    attribute->private_context      = 0;
-    attribute->context              = context;
-    attribute->device_privacy       = NULL;
-    attribute->scheduler_privacy    = NULL;
+    attribute->status = GRAPH_STAT_CREATED;
+    attribute->priority = 0;
+    attribute->policy = DEFAULT_POLICY;
+    attribute->private_context = 0;
+    attribute->context = context;
+    attribute->device_privacy = NULL;
+    attribute->scheduler_privacy = NULL;
 }
 
-
 void destroy_attribute(struct graph* graph, ir_attribute_t* attribute)
 {
     if (NULL != attribute->device_privacy)
@@ -58,14 +56,12 @@ void destroy_attribute(struct graph* graph, ir_attribute_t* attribute)
     sys_free(attribute);
 }
 
-
 int release_device_mem(struct device* dev, ir_memory_t* dev_mem)
 {
     // TODO:
     return -1;
 }
 
-
 void init_ir_context(ir_context_t* context, const char* name)
 {
     if (NULL != name)
@@ -76,11 +72,11 @@ void init_ir_context(ir_context_t* context, const char* name)
     }
     else
     {
-        context->name           = NULL;
+        context->name = NULL;
     }
 
-    context->scheduler          = NULL;
-    context->device             = NULL;
-    context->default_options    = NULL;
-    context->device_options     = NULL;
+    context->scheduler = NULL;
+    context->device = NULL;
+    context->default_options = NULL;
+    context->device_options = NULL;
 }
diff --git a/source/executer/executer.h b/source/executer/executer.h
index 1d86dc2d9..fe381f4c2 100644
--- a/source/executer/executer.h
+++ b/source/executer/executer.h
@@ -32,7 +32,6 @@ struct device;
 struct graph;
 struct scheduler;
 
-
 /*!
  * @struct ir_context_t
  * @brief  Abstract neural network runnable execution context
@@ -40,42 +39,39 @@ struct scheduler;
 typedef struct context
 {
     char* name;
-    struct scheduler* scheduler;        //!< binding scheduler of this context
-    struct device*    device;           //!< binding device of this context
-    void*  default_options;             //<! default device options of this context
-    void*  device_options;              //<! device options of this context
+    struct scheduler* scheduler; //!< binding scheduler of this context
+    struct device* device;       //!< binding device of this context
+    void* default_options;       //!< default device options of this context
+    void* device_options;        //!< device options of this context
 } ir_context_t;
 
-
 /*!
  * @struct ir_memory_t
  * @brief  The memory block used in device
  */
 typedef struct memory
 {
-    uint32_t  dev_mem_size;
-    uint8_t   dev_type;
-    uint8_t   cpu_read_ready;
-    uint8_t   cpu_write_done;
+    uint32_t dev_mem_size;
+    uint8_t dev_type;
+    uint8_t cpu_read_ready;
+    uint8_t cpu_write_done;
 
-    void*     mapped_mem;
-    void*     privacy; /* opaque pointer for device to interpret the dev_mem_addr */
+    void* mapped_mem;
+    void* privacy;     /* opaque pointer for device to interpret the dev_mem_addr */
     uintptr_t address; /* why not pointer? as in 32bit CPU, the dev address may be 64bit */
 } ir_memory_t;
 
-
 typedef struct attribute
 {
     uint8_t status;
     uint8_t priority;
     uint8_t policy;
     uint8_t private_context;
-    struct  context* context;
-    void*   device_privacy;
-    void*   scheduler_privacy;
+    struct context* context;
+    void* device_privacy;
+    void* scheduler_privacy;
 } ir_attribute_t;
 
-
 /*!
  * @brief  Initialize a context.
  *
@@ -84,7 +80,6 @@ typedef struct attribute
  */
 void init_ir_context(ir_context_t* context, const char* name);
 
-
 /*!
  * @brief  Init graph attribute.
  *
@@ -93,7 +88,6 @@ void init_ir_context(ir_context_t* context, const char* name);
  */
 void init_attribute(ir_attribute_t* attribute, struct context* context);
 
-
 /*!
  * @brief  Release graph attribute.
  *
@@ -102,7 +96,6 @@ void init_attribute(ir_attribute_t* attribute, struct context* context);
  */
 void destroy_attribute(struct graph* graph, ir_attribute_t* attribute);
 
-
 /*!
  * @brief  Release device memory.
  *
diff --git a/source/graph/graph.c b/source/graph/graph.c
index 59506836e..7a886b6a1 100644
--- a/source/graph/graph.c
+++ b/source/graph/graph.c
@@ -36,10 +36,8 @@
 #include "utility/utils.h"
 #include "utility/log.h"
 
-
 #include <string.h>
 
-
 ir_graph_t* create_ir_graph(struct context* context)
 {
     ir_graph_t* ir_graph = (ir_graph_t*)sys_malloc(sizeof(ir_graph_t));
@@ -55,37 +53,35 @@ ir_graph_t* create_ir_graph(struct context* context)
     return ir_graph;
 }
 
-
 void init_ir_graph(ir_graph_t* graph, struct context* context)
 {
-    graph->tensor_list          = NULL;
-    graph->node_list            = NULL;
-    graph->input_nodes          = NULL;
-    graph->output_nodes         = NULL;
+    graph->tensor_list = NULL;
+    graph->node_list = NULL;
+    graph->input_nodes = NULL;
+    graph->output_nodes = NULL;
 
-    graph->tensor_num           = 0;
-    graph->node_num             = 0;
-    graph->input_num            = 0;
-    graph->output_num           = 0;
+    graph->tensor_num = 0;
+    graph->node_num = 0;
+    graph->input_num = 0;
+    graph->output_num = 0;
 
-    graph->subgraph_list        = create_vector(sizeof(struct subgraph*), NULL);
+    graph->subgraph_list = create_vector(sizeof(struct subgraph*), NULL);
 
-    graph->graph_layout         = TENGINE_LAYOUT_NCHW;
-    graph->model_layout         = TENGINE_LAYOUT_NCHW;
-    graph->model_format         = MODEL_FORMAT_TENGINE;
+    graph->graph_layout = TENGINE_LAYOUT_NCHW;
+    graph->model_layout = TENGINE_LAYOUT_NCHW;
+    graph->model_format = MODEL_FORMAT_TENGINE;
 
-    graph->serializer           = NULL;
-    graph->serializer_privacy   = NULL;
+    graph->serializer = NULL;
+    graph->serializer_privacy = NULL;
 
-    graph->device               = NULL;
-    graph->device_privacy       = NULL;
+    graph->device = NULL;
+    graph->device_privacy = NULL;
 
-    graph->status               = GRAPH_STAT_CREATED;
+    graph->status = GRAPH_STAT_CREATED;
 
     init_attribute(graph->attribute, context);
 }
 
-
 void destroy_ir_graph(ir_graph_t* graph)
 {
     //!< 1, destroy subgraph
@@ -134,7 +130,6 @@ void destroy_ir_graph(ir_graph_t* graph)
     sys_free(graph);
 }
 
-
 int set_ir_graph_input_node(ir_graph_t* graph, int16_t input_nodes[], int input_number)
 {
     if (0 >= input_number)
@@ -142,7 +137,7 @@ int set_ir_graph_input_node(ir_graph_t* graph, int16_t input_nodes[], int input_
         return -1;
     }
 
-    int16_t* new_input_nodes = ( int16_t* )sys_malloc(input_number * sizeof(int16_t));
+    int16_t* new_input_nodes = (int16_t*)sys_malloc(input_number * sizeof(int16_t));
     if (NULL == new_input_nodes)
     {
         return -1;
@@ -200,25 +195,21 @@ int set_ir_graph_output_node(ir_graph_t* graph, int16_t output_nodes[], int outp
     return 0;
 }
 
-
 struct tensor* get_ir_graph_tensor(ir_graph_t* graph, int index)
 {
     return graph->tensor_list[index];
 }
 
-
 struct node* get_ir_graph_node(ir_graph_t* graph, int index)
 {
     return graph->node_list[index];
 }
 
-
 struct subgraph* get_ir_graph_subgraph(ir_graph_t* graph, int index)
 {
     return *(struct subgraph**)get_vector_data(graph->subgraph_list, index);
 }
 
-
 int infer_ir_graph_shape(ir_graph_t* graph)
 {
     const int node_num = graph->node_num;
@@ -281,7 +272,6 @@ int infer_ir_graph_shape(ir_graph_t* graph)
     return 0;
 }
 
-
 void dump_ir_graph(ir_graph_t* graph)
 {
     TLOG_INFO("graph node_num %u tensor_num: %u  subgraph_num: %u\n", graph->node_num, graph->tensor_num,
diff --git a/source/graph/graph.h b/source/graph/graph.h
index e5d746138..a336ef837 100644
--- a/source/graph/graph.h
+++ b/source/graph/graph.h
@@ -33,41 +33,39 @@ struct tensor;
 struct device;
 struct attribute;
 
-
 /*!
  * @struct ir_graph_t
  * @brief  Abstract graph intermediate representation
  */
 typedef struct graph
 {
-    struct tensor** tensor_list;            //!< the tensor list of a graph
-    struct node**   node_list;              //!< the node list of a graph
-    int16_t* input_nodes;                   //!< input nodes index array of a graph
-    int16_t* output_nodes;                  //!< output nodes index array of a graph
+    struct tensor** tensor_list; //!< the tensor list of a graph
+    struct node** node_list;     //!< the node list of a graph
+    int16_t* input_nodes;        //!< input nodes index array of a graph
+    int16_t* output_nodes;       //!< output nodes index array of a graph
 
-    uint16_t tensor_num;                    //!< the count of all graph tensor
-    uint16_t node_num;                      //!< the count of all graph node
-    uint16_t input_num;                     //!< input nodes index count of a graph
-    uint16_t output_num;                    //!< input nodes index count of a graph
+    uint16_t tensor_num; //!< the count of all graph tensor
+    uint16_t node_num;   //!< the count of all graph node
+    uint16_t input_num;  //!< input nodes index count of a graph
+    uint16_t output_num; //!< output nodes index count of a graph
 
-    int8_t   graph_layout;                  //!< the data layout of a graph
-    int8_t   model_layout;                  //!< model layout of graph source model
-    int8_t   model_format;                  //!< model format of graph source model
+    int8_t graph_layout; //!< the data layout of a graph
+    int8_t model_layout; //!< model layout of graph source model
+    int8_t model_format; //!< model format of graph source model
 
-    uint8_t  status;                        //!< the status of graph
+    uint8_t status; //!< the status of graph
 
-    struct   serializer* serializer;        //!< serializer of graph
-    void*    serializer_privacy;            //!< privacy data of serializer
+    struct serializer* serializer; //!< serializer of graph
+    void* serializer_privacy;      //!< privacy data of serializer
 
-    struct   device* device;                //!< assigned nn_device for this graph
-    void*    device_privacy;                //!< privacy data of device
+    struct device* device; //!< assigned nn_device for this graph
+    void* device_privacy;  //!< privacy data of device
 
-    struct   attribute*  attribute;         //<! attribute of graph
+    struct attribute* attribute; //!< attribute of graph
 
-    struct vector* subgraph_list;           //!< subgraph list of this graph
+    struct vector* subgraph_list; //!< subgraph list of this graph
 } ir_graph_t;
 
-
 /*!
  * @brief Create a graph.
  *
@@ -77,7 +75,6 @@ typedef struct graph
  */
 struct graph* create_ir_graph(struct context* context);
 
-
 /*!
  * @brief Init a graph.
  *
@@ -86,7 +83,6 @@ struct graph* create_ir_graph(struct context* context);
  */
 void init_ir_graph(ir_graph_t* graph, struct context* context);
 
-
 /*!
  * @brief Destroy a graph.
  *
@@ -96,7 +92,6 @@ void init_ir_graph(ir_graph_t* graph, struct context* context);
  */
 void destroy_ir_graph(ir_graph_t* graph);
 
-
 /*!
  * @brief Set input nodes for specific graph.
  *
@@ -110,7 +105,6 @@ void destroy_ir_graph(ir_graph_t* graph);
  */
 int set_ir_graph_input_node(ir_graph_t* graph, int16_t input_nodes[], int input_number);
 
-
 /*!
  * @brief Set output nodes for specific graph.
  *
@@ -124,7 +118,6 @@ int set_ir_graph_input_node(ir_graph_t* graph, int16_t input_nodes[], int input_
  */
 int set_ir_graph_output_node(ir_graph_t* graph, int16_t output_nodes[], int output_number);
 
-
 /*!
  * @brief Get specific tensor for a graph.
  *
@@ -135,7 +128,6 @@ int set_ir_graph_output_node(ir_graph_t* graph, int16_t output_nodes[], int outp
  */
 struct tensor* get_ir_graph_tensor(ir_graph_t* graph, int index);
 
-
 /*!
  * @brief Get specific node for a graph.
  *
@@ -146,7 +138,6 @@ struct tensor* get_ir_graph_tensor(ir_graph_t* graph, int index);
  */
 struct node* get_ir_graph_node(ir_graph_t* graph, int index);
 
-
 /*!
  * @brief Get output subgraph for a graph.
  *
@@ -157,7 +148,6 @@ struct node* get_ir_graph_node(ir_graph_t* graph, int index);
  */
 struct subgraph* get_ir_graph_subgraph(ir_graph_t* graph, int index);
 
-
 /*!
  * @brief Infer each node shape for a graph.
  *
@@ -167,7 +157,6 @@ struct subgraph* get_ir_graph_subgraph(ir_graph_t* graph, int index);
  */
 int infer_ir_graph_shape(ir_graph_t* graph);
 
-
 /*!
  * @brief  Dump the graph.
  *
diff --git a/source/graph/node.c b/source/graph/node.c
index 3cc50e4c6..0a6266d1b 100644
--- a/source/graph/node.c
+++ b/source/graph/node.c
@@ -37,27 +37,25 @@
 
 #define TENGINE_DEFAULT_LAYOUT TENGINE_LAYOUT_NCHW
 
-
 static void init_ir_node(ir_node_t* ir_node, int op_type, int op_version, int node_index)
 {
-    ir_node->index             = node_index;
-    ir_node->dynamic_shape     = 0;
-    ir_node->input_num         = 0;
-    ir_node->output_num        = 0;
-    ir_node->node_type         = TE_NODE_TYPE_INTER;
-    ir_node->input_tensors     = NULL;
-    ir_node->output_tensors    = NULL;
-    ir_node->name              = NULL;
-    ir_node->op.type           = op_type;
-    ir_node->op.version        = op_version;
-    ir_node->op.same_shape     = 1;
-    ir_node->op.param_size     = 0;
-    ir_node->op.param_mem      = NULL;
-    ir_node->op.infer_shape    = NULL;
-    ir_node->subgraph_idx      = -1;
+    ir_node->index = node_index;
+    ir_node->dynamic_shape = 0;
+    ir_node->input_num = 0;
+    ir_node->output_num = 0;
+    ir_node->node_type = TE_NODE_TYPE_INTER;
+    ir_node->input_tensors = NULL;
+    ir_node->output_tensors = NULL;
+    ir_node->name = NULL;
+    ir_node->op.type = op_type;
+    ir_node->op.version = op_version;
+    ir_node->op.same_shape = 1;
+    ir_node->op.param_size = 0;
+    ir_node->op.param_mem = NULL;
+    ir_node->op.infer_shape = NULL;
+    ir_node->subgraph_idx = -1;
 }
 
-
 ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_type, int op_version)
 {
     ir_node_t* node = (ir_node_t*)sys_malloc(sizeof(ir_node_t));
@@ -69,7 +67,7 @@ ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_
     init_ir_node(node, op_type, op_version, ir_graph->node_num);
 
     // check if any op param should be set
-    ir_method_t * method = find_op_method(op_type, op_version);
+    ir_method_t* method = find_op_method(op_type, op_version);
     if ((NULL != method) && (NULL != method->init) && (method->init(&node->op) < 0))
     {
         sys_free(node);
@@ -98,7 +96,6 @@ ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_
     return node;
 }
 
-
 void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node)
 {
     if (NULL != ir_node->name)
@@ -129,10 +126,9 @@ void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node)
     sys_free(ir_node);
 }
 
-
 char* create_ir_node_name_from_index(int index)
 {
-    char* name = ( char* )sys_malloc(16);
+    char* name = (char*)sys_malloc(16);
     if (NULL == name)
     {
         return NULL;
@@ -141,7 +137,6 @@ char* create_ir_node_name_from_index(int index)
     return name;
 }
 
-
 int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name)
 {
     ir_node_t* ir_node;
@@ -177,12 +172,11 @@ int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name)
     return -1;
 }
 
-
 int set_ir_node_input_tensor(ir_node_t* node, int input_idx, ir_tensor_t* tensor)
 {
     if (input_idx >= node->input_num)
     {
-        int16_t* new_tensor = ( int16_t* )sys_realloc(node->input_tensors, sizeof(int16_t) * (input_idx + 1));
+        int16_t* new_tensor = (int16_t*)sys_realloc(node->input_tensors, sizeof(int16_t) * (input_idx + 1));
 
         if (NULL == new_tensor)
         {
@@ -206,7 +200,6 @@ int set_ir_node_input_tensor(ir_node_t* node, int input_idx, ir_tensor_t* tensor
     return 0;
 }
 
-
 int set_ir_node_output_tensor(ir_node_t* node, int output_idx, ir_tensor_t* tensor)
 {
     if (output_idx >= node->output_num)
@@ -228,7 +221,6 @@ int set_ir_node_output_tensor(ir_node_t* node, int output_idx, ir_tensor_t* tens
     return 0;
 }
 
-
 void dump_ir_node(struct graph* ir_graph, ir_node_t* ir_node)
 {
     if (NULL != ir_node->name)
diff --git a/source/graph/node.h b/source/graph/node.h
index 7f8f8b74a..507737e32 100644
--- a/source/graph/node.h
+++ b/source/graph/node.h
@@ -31,35 +31,32 @@
 
 #include <stdint.h>
 
-
 struct node;
 struct tensor;
 struct graph;
 
-
 /*!
  * @struct ir_node_t
  * @brief  Abstract node intermediate representation
  */
 typedef struct node
 {
-    uint16_t  index;            //!< the index of a node
-    uint8_t   dynamic_shape;    //!< flag of dynamic shape
-    uint8_t   input_num;        //!< count of input tensor
-    uint8_t   output_num;       //!< count of output tensor
-    uint8_t   node_type;        //!< type of node: { input, output, intermediate }
-    int8_t    subgraph_idx;     //!< id of the owner subgraph
+    uint16_t index;        //!< the index of a node
+    uint8_t dynamic_shape; //!< flag of dynamic shape
+    uint8_t input_num;     //!< count of input tensor
+    uint8_t output_num;    //!< count of output tensor
+    uint8_t node_type;     //!< type of node: { input, output, intermediate }
+    int8_t subgraph_idx;   //!< id of the owner subgraph
 
-    uint16_t* input_tensors;    //!< id array of input tensor
-    uint16_t* output_tensors;   //!< id array of output tensor
+    uint16_t* input_tensors;  //!< id array of input tensor
+    uint16_t* output_tensors; //!< id array of output tensor
 
-    char* name;                 //!< name of a node
+    char* name; //!< name of a node
 
-    struct op op;               //!< operator of a node
-    struct graph* graph;        //!< pointer of the related graph
+    struct op op;        //!< operator of a node
+    struct graph* graph; //!< pointer of the related graph
 } ir_node_t;
 
-
 /*!
  * @brief Create a node for a graph.
  *
@@ -72,7 +69,6 @@ typedef struct node
  */
 ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_type, int op_version);
 
-
 /*!
  * @brief Destroy a node.
  *
@@ -83,7 +79,6 @@ ir_node_t* create_ir_node(struct graph* ir_graph, const char* node_name, int op_
  */
 void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node);
 
-
 /*!
  * @brief  Set node name from id, for anonymity ones.
  *
@@ -93,7 +88,6 @@ void destroy_ir_node(struct graph* ir_graph, ir_node_t* ir_node);
  */
 char* create_ir_node_name_from_index(int index);
 
-
 /*!
  * @brief  Get node id from name, for anonymity ones.
  *
@@ -108,7 +102,6 @@ char* create_ir_node_name_from_index(int index);
  */
 int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name);
 
-
 /*!
  * @brief  Mark a tensor as node a specific input tensor.
  *
@@ -120,7 +113,6 @@ int get_ir_node_index_from_name(struct graph* ir_graph, const char* node_name);
  */
 int set_ir_node_input_tensor(ir_node_t* ir_node, int input_idx, struct tensor* tensor);
 
-
 /*!
  * @brief  Mark a tensor as node a specific output tensor.
  *
@@ -132,7 +124,6 @@ int set_ir_node_input_tensor(ir_node_t* ir_node, int input_idx, struct tensor* t
  */
 int set_ir_node_output_tensor(ir_node_t* ir_node, int output_idx, struct tensor* tensor);
 
-
 /*!
  * @brief  Dump the node.
  *
diff --git a/source/graph/subgraph.c b/source/graph/subgraph.c
index f0b619dab..41387e1ce 100644
--- a/source/graph/subgraph.c
+++ b/source/graph/subgraph.c
@@ -29,25 +29,23 @@
 #include "device/device.h"
 #include "api/c_api.h"
 
-
 void init_ir_subgraph(struct graph* graph, struct subgraph* subgraph, int index)
 {
-    subgraph->index                 = index;
-    subgraph->input_ready_count     = 0;
-    subgraph->input_wait_count      = 0;
-    subgraph->input_num             = 0;
-    subgraph->output_num            = 0;
-    subgraph->node_num              = 0;
-    subgraph->node_list             = NULL;
-    subgraph->input_tensor_list     = NULL;
-    subgraph->output_tensor_list    = NULL;
-    subgraph->graph                 = graph;
-    subgraph->device                = NULL;
-    subgraph->device_graph          = NULL;
-    subgraph->status                = GRAPH_STAT_CREATED;
+    subgraph->index = index;
+    subgraph->input_ready_count = 0;
+    subgraph->input_wait_count = 0;
+    subgraph->input_num = 0;
+    subgraph->output_num = 0;
+    subgraph->node_num = 0;
+    subgraph->node_list = NULL;
+    subgraph->input_tensor_list = NULL;
+    subgraph->output_tensor_list = NULL;
+    subgraph->graph = graph;
+    subgraph->device = NULL;
+    subgraph->device_graph = NULL;
+    subgraph->status = GRAPH_STAT_CREATED;
 }
 
-
 void release_ir_subgraph(struct graph* graph, struct subgraph* subgraph)
 {
     struct device* device = subgraph->device;
diff --git a/source/graph/subgraph.h b/source/graph/subgraph.h
index 9f7936833..1ae252c3b 100644
--- a/source/graph/subgraph.h
+++ b/source/graph/subgraph.h
@@ -30,33 +30,31 @@
 struct graph;
 struct device;
 
-
 /*!
  * @struct ir_subgraph_t
  * @brief  Abstract subgraph intermediate representation
  */
 typedef struct subgraph
 {
-    uint8_t   index;                //!< the index of a subgraph
-    uint8_t   input_ready_count;    //!< the count of all in ready input tensors
-    uint8_t   input_wait_count;     //!< the count of all out of ready input tensors
-    uint8_t   input_num;            //!< the count of input tensors
-    uint8_t   output_num;           //!< the count of output tensors
-    uint8_t   status;               //!< the execution status of subgraph
+    uint8_t index;             //!< the index of a subgraph
+    uint8_t input_ready_count; //!< the count of input tensors that are ready
+    uint8_t input_wait_count;  //!< the count of input tensors that are not yet ready
+    uint8_t input_num;         //!< the count of input tensors
+    uint8_t output_num;        //!< the count of output tensors
+    uint8_t status;            //!< the execution status of subgraph
 
-    uint16_t  node_num;             //!< the count of nodes in subgraph
-    uint16_t* node_list;            //!< all nodes index list of subgraph
+    uint16_t node_num;   //!< the count of nodes in subgraph
+    uint16_t* node_list; //!< all nodes index list of subgraph
 
-    uint16_t* input_tensor_list;    //!< input tensors index list of subgraph
-    uint16_t* output_tensor_list;   //!< output tensors index list of subgraph
+    uint16_t* input_tensor_list;  //!< input tensors index list of subgraph
+    uint16_t* output_tensor_list; //!< output tensors index list of subgraph
 
-    struct graph*  graph;           //!< the pointer of the related graph
+    struct graph* graph; //!< the pointer of the related graph
 
-    struct device* device;          //!< the device which will the subgraph running on
-    void*  device_graph;            //!< the related device graph
+    struct device* device; //!< the device on which the subgraph will run
+    void* device_graph;    //!< the related device graph
 } ir_subgraph_t;
 
-
 /*!
  * @brief Init a subgraph.
  *
@@ -66,7 +64,6 @@ typedef struct subgraph
  */
 void init_ir_subgraph(struct graph* graph, ir_subgraph_t* subgraph, int index);
 
-
 /*!
  * @brief Release a subgraph.
  *
diff --git a/source/graph/tensor.c b/source/graph/tensor.c
index c5d049edf..5b065a458 100644
--- a/source/graph/tensor.c
+++ b/source/graph/tensor.c
@@ -38,46 +38,43 @@
 #include <stdio.h>
 #include <string.h>
 
-
 void init_ir_tensor(ir_tensor_t* ir_tensor, int tensor_index, int data_type)
 {
+    ir_tensor->index = tensor_index;
+    ir_tensor->producer = -1;
 
-    ir_tensor->index                = tensor_index;
-    ir_tensor->producer             = -1;
-
-    ir_tensor->consumer = ( int16_t* )sys_malloc(sizeof(int16_t) * TE_MAX_CONSUMER_NUM);
+    ir_tensor->consumer = (int16_t*)sys_malloc(sizeof(int16_t) * TE_MAX_CONSUMER_NUM);
     for (int i = 0; i < TE_MAX_CONSUMER_NUM; i++)
     {
         ir_tensor->consumer[i] = -1;
     }
 
-    ir_tensor->reshaped            = 0;
+    ir_tensor->reshaped = 0;
     ir_tensor->consumer_num = 0;
-    ir_tensor->tensor_type         = TENSOR_TYPE_VAR;
-    ir_tensor->data_type           = data_type;
-    ir_tensor->dim_num             = 0;
-    ir_tensor->elem_size           = get_tenser_element_size(data_type);
-    ir_tensor->subgraph_num        = 0;
-    ir_tensor->free_host_mem       = 0;
-    ir_tensor->internal_allocated  = 1;
-    ir_tensor->layout              = TENGINE_LAYOUT_NCHW;
-    ir_tensor->quant_param_num     = 0;
-    ir_tensor->elem_num            = 0;
+    ir_tensor->tensor_type = TENSOR_TYPE_VAR;
+    ir_tensor->data_type = data_type;
+    ir_tensor->dim_num = 0;
+    ir_tensor->elem_size = get_tenser_element_size(data_type);
+    ir_tensor->subgraph_num = 0;
+    ir_tensor->free_host_mem = 0;
+    ir_tensor->internal_allocated = 1;
+    ir_tensor->layout = TENGINE_LAYOUT_NCHW;
+    ir_tensor->quant_param_num = 0;
+    ir_tensor->elem_num = 0;
 
     for (int i = 0; i < MAX_SHAPE_DIM_NUM; i++)
     {
         ir_tensor->dims[i] = 0;
     }
 
-    ir_tensor->data                = NULL;
-    ir_tensor->name                = NULL;
-    ir_tensor->scale_list          = NULL;
-    ir_tensor->zp_list             = NULL;
-    ir_tensor->dev_mem             = NULL;
-    ir_tensor->subgraph_list       = NULL;
+    ir_tensor->data = NULL;
+    ir_tensor->name = NULL;
+    ir_tensor->scale_list = NULL;
+    ir_tensor->zp_list = NULL;
+    ir_tensor->dev_mem = NULL;
+    ir_tensor->subgraph_list = NULL;
 }
 
-
 ir_tensor_t* create_ir_tensor(ir_graph_t* ir_graph, const char* tensor_name, int data_type)
 {
     ir_tensor_t* ir_tensor = (ir_tensor_t*)sys_malloc(sizeof(ir_tensor_t));
@@ -122,7 +119,6 @@ ir_tensor_t* create_ir_tensor(ir_graph_t* ir_graph, const char* tensor_name, int
     return ir_tensor;
 }
 
-
 void destroy_ir_tensor(ir_graph_t* ir_graph, ir_tensor_t* ir_tensor)
 {
     if (ir_tensor->quant_param_num > 1)
@@ -166,7 +162,6 @@ void destroy_ir_tensor(ir_graph_t* ir_graph, ir_tensor_t* ir_tensor)
     sys_free(ir_tensor);
 }
 
-
 int set_ir_tensor_shape(ir_tensor_t* tensor, const int dims[], int dim_number)
 {
     if (MAX_SHAPE_DIM_NUM + 1 < dim_number)
@@ -194,7 +189,6 @@ int set_ir_tensor_shape(ir_tensor_t* tensor, const int dims[], int dim_number)
     return 0;
 }
 
-
 char* create_ir_tensor_name_from_index(int index)
 {
     char* name = (char*)sys_malloc(TE_COMMON_ALIGN_SIZE * 2);
@@ -208,7 +202,6 @@ char* create_ir_tensor_name_from_index(int index)
     return name;
 }
 
-
 int get_ir_tensor_index_from_name(ir_graph_t* graph, const char* tensor_name)
 {
     const char* last_symbol_ptr = strrchr(tensor_name, '_');
@@ -242,7 +235,6 @@ int get_ir_tensor_index_from_name(ir_graph_t* graph, const char* tensor_name)
     return -1;
 }
 
-
 int set_ir_tensor_quantization_parameter(ir_tensor_t* tensor, const float* scale, const int* zero_point, int number)
 {
     if (NULL == scale || NULL == zero_point)
@@ -284,7 +276,6 @@ int set_ir_tensor_quantization_parameter(ir_tensor_t* tensor, const float* scale
     return 0;
 }
 
-
 int get_ir_tensor_quantization_parameter(ir_tensor_t* tensor, float* scale, int* zero_point, int number)
 {
     if (number < tensor->quant_param_num)
@@ -306,7 +297,6 @@ int get_ir_tensor_quantization_parameter(ir_tensor_t* tensor, float* scale, int*
     return tensor->quant_param_num;
 }
 
-
 void dump_ir_tensor(ir_graph_t* g, ir_tensor_t* t)
 {
     if (NULL != t->name)
@@ -355,7 +345,7 @@ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index)
 {
     if (TE_MAX_CONSUMER_NUM <= ir_tensor->consumer_num)
     {
-        int16_t* new_consumer = ( int16_t* )sys_realloc(ir_tensor->consumer, sizeof(int16_t) * (ir_tensor->consumer_num + 1));
+        int16_t* new_consumer = (int16_t*)sys_realloc(ir_tensor->consumer, sizeof(int16_t) * (ir_tensor->consumer_num + 1));
         if (NULL == new_consumer)
         {
             return -1;
@@ -368,4 +358,4 @@ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index)
     ir_tensor->consumer_num++;
 
     return 0;
-} 
+}
diff --git a/source/graph/tensor.h b/source/graph/tensor.h
index c92a7943e..601d23467 100644
--- a/source/graph/tensor.h
+++ b/source/graph/tensor.h
@@ -36,31 +36,30 @@ extern "C" {
 struct node;
 struct graph;
 
-
 /*!
  * @struct ir_tensor_t
  * @brief  Abstract tensor intermediate representation
  */
 typedef struct tensor
 {
-    uint16_t index;                          //!< the index of a tensor
-    int16_t  producer;                       //!< node id, '-1' means no producer
-    int16_t* consumer;                       //!< consumer nodes array
-
-    uint8_t  reshaped;                       //!< the tensor's shape has changed
-    uint8_t  consumer_num;                   //!< count of consumer nodes
-    uint8_t  tensor_type;                    //!< tensor_type: { const, input, var, dep }
-    uint8_t  data_type;                      //!< data_type: { int8, uint8, fp32, fp16, int32 }
-    uint8_t  dim_num;                        //!< count of dimensions
-    uint8_t  elem_size;                      //!< size of single element
-    uint8_t  subgraph_num;                   //!< count of all subgraph those will waiting this tensor ready
-    uint8_t  free_host_mem;                  //!< should free host memory?
-    uint8_t  internal_allocated;             //!< how memory is allocated?
-    uint8_t  layout;                         //!< tensor layout: { TENGINE_LAYOUT_NCHW, TENGINE_LAYOUT_NHWC }
-
-    uint16_t quant_param_num;                //!< quantization dimension
-    uint32_t elem_num;                       //!< count of total elements
-    int dims[TE_MAX_SHAPE_DIM_NUM];          //!< shape dimensions
+    uint16_t index;    //!< the index of a tensor
+    int16_t producer;  //!< node id, '-1' means no producer
+    int16_t* consumer; //!< consumer nodes array
+
+    uint8_t reshaped;           //!< the tensor's shape has changed
+    uint8_t consumer_num;       //!< count of consumer nodes
+    uint8_t tensor_type;        //!< tensor_type: { const, input, var, dep }
+    uint8_t data_type;          //!< data_type: { int8, uint8, fp32, fp16, int32 }
+    uint8_t dim_num;            //!< count of dimensions
+    uint8_t elem_size;          //!< size of single element
+    uint8_t subgraph_num;       //!< count of all subgraph those will waiting this tensor ready
+    uint8_t free_host_mem;      //!< should free host memory?
+    uint8_t internal_allocated; //!< how memory is allocated?
+    uint8_t layout;             //!< tensor layout: { TENGINE_LAYOUT_NCHW, TENGINE_LAYOUT_NHWC }
+
+    uint16_t quant_param_num;       //!< quantization dimension
+    uint32_t elem_num;              //!< count of total elements
+    int dims[TE_MAX_SHAPE_DIM_NUM]; //!< shape dimensions
 
     /*!
      * @union anonymity data pointer
@@ -68,15 +67,15 @@ typedef struct tensor
      */
     union
     {
-        void*    data;
-        int8_t*    i8;
-        uint8_t*   u8;
-        float*    f32;
-        uint16_t*   f16;
-        int32_t*  i32;
+        void* data;
+        int8_t* i8;
+        uint8_t* u8;
+        float* f32;
+        uint16_t* f16;
+        int32_t* i32;
     };
 
-    char* name;                             //!< tensor name
+    char* name; //!< tensor name
 
     /*!
      * @union anonymity quantization scale union
@@ -85,7 +84,7 @@ typedef struct tensor
     union
     {
         float* scale_list;
-        float  scale;
+        float scale;
     };
 
     /*!
@@ -94,15 +93,14 @@ typedef struct tensor
      */
     union
     {
-        int  zero_point;
+        int zero_point;
         int* zp_list;
     };
 
     struct dev_mem* dev_mem;
-    uint8_t* subgraph_list;                 //!< subgraph index list of those subgraph will waiting this tensor ready
+    uint8_t* subgraph_list; //!< subgraph index list of those subgraph will waiting this tensor ready
 } ir_tensor_t;
 
-
 /*!
  * @brief Create a tensor for a graph.
  *
@@ -114,7 +112,6 @@ typedef struct tensor
  */
 ir_tensor_t* create_ir_tensor(struct graph* graph, const char* tensor_name, int data_type);
 
-
 /*!
  * @brief Destroy a tensor.
  *
@@ -125,7 +122,6 @@ ir_tensor_t* create_ir_tensor(struct graph* graph, const char* tensor_name, int
  */
 void destroy_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor);
 
-
 /*!
  * @brief  Set shape for a tensor.
  *
@@ -137,7 +133,6 @@ void destroy_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor);
  */
 int set_ir_tensor_shape(ir_tensor_t* ir_tensor, const int dims[], int dim_number);
 
-
 /*!
  * @brief  Set tensor name from id, for anonymity ones.
  *
@@ -147,7 +142,6 @@ int set_ir_tensor_shape(ir_tensor_t* ir_tensor, const int dims[], int dim_number
  */
 char* create_ir_tensor_name_from_index(int index);
 
-
 /*!
  * @brief  Get tensor id from name, for anonymity ones.
  *
@@ -158,7 +152,6 @@ char* create_ir_tensor_name_from_index(int index);
  */
 int get_ir_tensor_index_from_name(struct graph* ir_graph, const char* tensor_name);
 
-
 /*!
  * @brief  Set tensor quantization parameter.
  *
@@ -171,7 +164,6 @@ int get_ir_tensor_index_from_name(struct graph* ir_graph, const char* tensor_nam
  */
 int set_ir_tensor_quantization_parameter(ir_tensor_t* ir_tensor, const float* scale, const int* zero_point, int number);
 
-
 /*!
  * @brief  Get tensor quantization parameter.
  *
@@ -184,7 +176,6 @@ int set_ir_tensor_quantization_parameter(ir_tensor_t* ir_tensor, const float* sc
  */
 int get_ir_tensor_quantization_parameter(ir_tensor_t* ir_tensor, float* scale, int* zero_point, int number);
 
-
 /*!
  * @brief  Dump the tensor.
  *
@@ -201,7 +192,7 @@ void dump_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor);
  *
  * @return statue value, 0 success, other value failure.
  */
-int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index); 
+int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index);
 
 #ifdef __cplusplus
 }
diff --git a/source/module/module.c b/source/module/module.c
index 2a945a285..f3e4398bb 100644
--- a/source/module/module.c
+++ b/source/module/module.c
@@ -35,12 +35,10 @@
 #include <stddef.h>
 #include <string.h>
 
-
-static vector_t* internal_serializer_registry = NULL;   //!< registry of model serializer
-static vector_t* internal_device_registry     = NULL;   //!< registry of runnable neural network device
-static vector_t* internal_op_method_registry  = NULL;   //!< registry of operators
-static vector_t* internal_op_name_registry    = NULL;   //!< registry of operators name
-
+static vector_t* internal_serializer_registry = NULL; //!< registry of model serializer
+static vector_t* internal_device_registry = NULL;     //!< registry of runnable neural network device
+static vector_t* internal_op_method_registry = NULL;  //!< registry of operators
+static vector_t* internal_op_name_registry = NULL;    //!< registry of operators name
 
 /*!
  * @struct ir_op_map_t
@@ -48,14 +46,12 @@ static vector_t* internal_op_name_registry    = NULL;   //!< registry of operato
  */
 typedef struct op_name_entry
 {
-    int type;               //!< the type of a operator
-    const char* name;       //!< the name of a operator
+    int type;         //!< the type of a operator
+    const char* name; //!< the name of a operator
 } ir_op_name_entry_t;
 
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
 static int initialize_serializer_registry(const char* name)
 {
     if (NULL == internal_serializer_registry)
@@ -71,7 +67,6 @@ static int initialize_serializer_registry(const char* name)
     return 0;
 }
 
-
 int register_serializer(serializer_t* serializer)
 {
     initialize_serializer_registry(serializer->get_name(serializer));
@@ -101,7 +96,6 @@ int register_serializer(serializer_t* serializer)
     return 0;
 }
 
-
 serializer_t* find_serializer_via_name(const char* name)
 {
     if (NULL == internal_serializer_registry)
@@ -131,7 +125,6 @@ serializer_t* find_serializer_via_name(const char* name)
     return NULL;
 }
 
-
 serializer_t* find_serializer_via_index(int index)
 {
     int count = get_serializer_count();
@@ -147,7 +140,6 @@ serializer_t* find_serializer_via_index(int index)
     }
 }
 
-
 int get_serializer_count()
 {
     if (NULL == internal_serializer_registry)
@@ -160,7 +152,6 @@ int get_serializer_count()
     }
 }
 
-
 int unregister_serializer(serializer_t* serializer)
 {
     if (NULL == serializer)
@@ -194,7 +185,6 @@ int unregister_serializer(serializer_t* serializer)
     return remove_vector_via_pointer(internal_serializer_registry, &serializer);
 }
 
-
 int release_serializer_registry()
 {
     while (get_vector_num(internal_serializer_registry) > 0)
@@ -209,10 +199,8 @@ int release_serializer_registry()
     return 0;
 }
 
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
 static int initialize_device_registry(const char* name)
 {
     if (NULL == internal_device_registry)
@@ -228,7 +216,6 @@ static int initialize_device_registry(const char* name)
     return 0;
 }
 
-
 ir_device_t* find_device_via_name(const char* name)
 {
     if (NULL == internal_device_registry)
@@ -258,13 +245,11 @@ ir_device_t* find_device_via_name(const char* name)
     return NULL;
 }
 
-
 struct device* find_default_device()
 {
     return find_device_via_name("CPU");
 }
 
-
 ir_device_t* find_device_via_index(int index)
 {
     int count = get_device_count();
@@ -280,7 +265,6 @@ ir_device_t* find_device_via_index(int index)
     }
 }
 
-
 int get_device_count()
 {
     if (NULL == internal_device_registry)
@@ -293,7 +277,6 @@ int get_device_count()
     }
 }
 
-
 int register_device(ir_device_t* device)
 {
     initialize_device_registry(device->name);
@@ -323,7 +306,6 @@ int register_device(ir_device_t* device)
     return 0;
 }
 
-
 int unregister_device(ir_device_t* device)
 {
     if (NULL == find_device_via_name(device->name))
@@ -339,7 +321,6 @@ int unregister_device(ir_device_t* device)
     return remove_vector_via_pointer(internal_device_registry, &device);
 }
 
-
 int release_device_registry()
 {
     while (get_vector_num(internal_device_registry) > 0)
@@ -354,10 +335,8 @@ int release_device_registry()
     return 0;
 }
 
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
 int initialize_op_name_registry(const char* name)
 {
     if (NULL == internal_op_name_registry)
@@ -374,7 +353,6 @@ int initialize_op_name_registry(const char* name)
     return 0;
 }
 
-
 int register_op_name(int type, const char* name)
 {
     initialize_op_name_registry(name);
@@ -392,7 +370,6 @@ int register_op_name(int type, const char* name)
     return push_vector_data(internal_op_name_registry, &op_map);
 }
 
-
 int unregister_op_name(int type)
 {
     int i;
@@ -415,7 +392,6 @@ int unregister_op_name(int type)
     return 0;
 }
 
-
 int release_op_name_registry()
 {
     while (get_vector_num(internal_op_name_registry) > 0)
@@ -430,7 +406,6 @@ int release_op_name_registry()
     return 0;
 }
 
-
 static int initialize_op_registry(const char* name)
 {
     if (NULL == internal_op_method_registry)
@@ -446,7 +421,6 @@ static int initialize_op_registry(const char* name)
     return 0;
 }
 
-
 static int register_op_registry(ir_method_t* method)
 {
     if (find_op_method(method->type, method->version))
@@ -457,7 +431,6 @@ static int register_op_registry(ir_method_t* method)
     return push_vector_data(internal_op_method_registry, method);
 }
 
-
 int register_op(int type, const char* name, ir_method_t* method)
 {
     initialize_op_registry(name);
@@ -485,7 +458,6 @@ int register_op(int type, const char* name, ir_method_t* method)
     return 0;
 }
 
-
 ir_method_t* find_op_method(int type, int version)
 {
     int op_count = get_vector_num(internal_op_method_registry);
@@ -503,7 +475,6 @@ ir_method_t* find_op_method(int type, int version)
     return NULL;
 }
 
-
 ir_method_t* find_op_method_via_index(int index)
 {
     int count = get_op_method_count();
@@ -519,7 +490,6 @@ ir_method_t* find_op_method_via_index(int index)
     }
 }
 
-
 const char* find_op_name(int type)
 {
     int count = get_vector_num(internal_op_name_registry);
@@ -535,7 +505,6 @@ const char* find_op_name(int type)
     return NULL;
 }
 
-
 int get_op_method_count()
 {
     if (NULL == internal_op_method_registry)
@@ -548,7 +517,6 @@ int get_op_method_count()
     }
 }
 
-
 int unregister_op(int type, int version)
 {
     int matched_count = 0;
@@ -587,7 +555,6 @@ int unregister_op(int type, int version)
     return 0;
 }
 
-
 int release_op_registry(void)
 {
     while (get_vector_num(internal_op_method_registry) > 0)
diff --git a/source/module/module.h b/source/module/module.h
index 03f5560ef..36a12bf19 100644
--- a/source/module/module.h
+++ b/source/module/module.h
@@ -29,7 +29,6 @@ struct op;
 struct method;
 struct device;
 
-
 /*!
  * @brief Register a serializer.
  *
@@ -39,7 +38,6 @@ struct device;
  */
 int register_serializer(struct serializer* serializer);
 
-
 /*!
  * @brief Find the serializer via its name.
  *
@@ -49,7 +47,6 @@ int register_serializer(struct serializer* serializer);
  */
 struct serializer* find_serializer_via_name(const char* name);
 
-
 /*!
  * @brief Find the serializer via its registered index.
  *
@@ -59,7 +56,6 @@ struct serializer* find_serializer_via_name(const char* name);
  */
 struct serializer* find_serializer_via_index(int index);
 
-
 /*!
  * @brief Get count of all registered serializer.
  *
@@ -67,7 +63,6 @@ struct serializer* find_serializer_via_index(int index);
  */
 int get_serializer_count();
 
-
 /*!
  * @brief Unregister a serializer.
  *
@@ -77,7 +72,6 @@ int get_serializer_count();
  */
 int unregister_serializer(struct serializer* serializer);
 
-
 /*!
  * @brief Release all serializer.
  *
@@ -85,7 +79,6 @@ int unregister_serializer(struct serializer* serializer);
  */
 int release_serializer_registry();
 
-
 /*!
  * @brief Register a device.
  *
@@ -95,7 +88,6 @@ int release_serializer_registry();
  */
 int register_device(struct device* device);
 
-
 /*!
  * @brief Find the device via its name.
  *
@@ -105,7 +97,6 @@ int register_device(struct device* device);
  */
 struct device* find_device_via_name(const char* name);
 
-
 /*!
  * @brief Find the default device.
  *
@@ -113,7 +104,6 @@ struct device* find_device_via_name(const char* name);
  */
 struct device* find_default_device();
 
-
 /*!
  * @brief Find the device via its registered index.
  *
@@ -123,7 +113,6 @@ struct device* find_default_device();
  */
 struct device* find_device_via_index(int index);
 
-
 /*!
  * @brief Get count of all registered device.
  *
@@ -131,7 +120,6 @@ struct device* find_device_via_index(int index);
  */
 int get_device_count();
 
-
 /*!
  * @brief Register a device.
  *
@@ -141,7 +129,6 @@ int get_device_count();
  */
 int unregister_device(struct device* device);
 
-
 /*!
  * @brief Release all device.
  *
@@ -149,7 +136,6 @@ int unregister_device(struct device* device);
  */
 int release_device_registry();
 
-
 /*!
  * @brief Register an operator method.
  *
@@ -161,7 +147,6 @@ int release_device_registry();
  */
 int register_op(int type, const char* name, struct method* method);
 
-
 /*!
  * @brief Find an operator method.
  *
@@ -172,7 +157,6 @@ int register_op(int type, const char* name, struct method* method);
  */
 struct method* find_op_method(int type, int version);
 
-
 /*!
  * @brief Find an operator method via its registered index.
  *
@@ -182,7 +166,6 @@ struct method* find_op_method(int type, int version);
  */
 struct method* find_op_method_via_index(int index);
 
-
 /*!
  * @brief Find an operator name.
  *
@@ -192,7 +175,6 @@ struct method* find_op_method_via_index(int index);
  */
 const char* find_op_name(int type);
 
-
 /*!
  * @brief Get count of all registered operator method.
  *
@@ -200,7 +182,6 @@ const char* find_op_name(int type);
  */
 int get_op_method_count();
 
-
 /*!
  * @brief Register an operator.
  *
@@ -211,7 +192,6 @@ int get_op_method_count();
  */
 int unregister_op(int type, int version);
 
-
 /*!
  * @brief Release all operator.
  *
diff --git a/source/operator/op.c b/source/operator/op.c
index 051339885..fb29d670c 100644
--- a/source/operator/op.c
+++ b/source/operator/op.c
@@ -29,13 +29,11 @@
 
 #include <string.h>
 
-
 void init_op_struct(ir_op_t* op)
 {
     memset(op, 0, sizeof(ir_node_t));
 }
 
-
 void init_method_struct(ir_method_t* method)
 {
     memset(method, 0, sizeof(ir_method_t));
diff --git a/source/operator/op.h b/source/operator/op.h
index 36b55fe15..129fae65f 100644
--- a/source/operator/op.h
+++ b/source/operator/op.h
@@ -29,10 +29,8 @@
 
 #include "op_name.h"
 
-
 struct node;
 
-
 /*!
  * @enum  op_type
  * @brief Enumeration of supported operators
@@ -53,7 +51,7 @@ enum
     OP_CLIP,
     OP_COMPARISON,
     OP_CONCAT,
-    OP_CONST,    
+    OP_CONST,
     OP_CONV,
     OP_CROP,
     OP_DECONV,
@@ -144,37 +142,32 @@ enum
     OP_BUILTIN_LAST
 };
 
-
 /*!
  * @struct ir_op_t
  * @brief  Abstract operator intermediate representation
  */
 typedef struct op
 {
-    uint16_t type;                          //!< the type of a operator
-    uint8_t  version;                       //!< the version of a operator
-    uint8_t  same_shape;                    //!< the flag of whether the operator will keep shape
-    uint16_t param_size;                    //!< size of parameter memory buffer
-    void* param_mem;                        //!< parameter memory buffer
-    int (*infer_shape)(struct node*);       //!< infer(or broadcast) the shape from input to output(s)
+    uint16_t type;                    //!< the type of a operator
+    uint8_t version;                  //!< the version of a operator
+    uint8_t same_shape;               //!< the flag of whether the operator will keep shape
+    uint16_t param_size;              //!< size of parameter memory buffer
+    void* param_mem;                  //!< parameter memory buffer
+    int (*infer_shape)(struct node*); //!< infer(or broadcast) the shape from input to output(s)
 } ir_op_t;
 
-
 /*!
  * @struct ir_op_method_t
  * @brief  Abstract method of operator intermediate representation
  */
 typedef struct method
 {
-
-    int  type;                              //!< the type of a operator
-    int  version;                           //!< the version of a operator
-    int  (*init)(ir_op_t* op);
+    int type;    //!< the type of a operator
+    int version; //!< the version of a operator
+    int (*init)(ir_op_t* op);
     void (*release)(ir_op_t* op);
 } ir_method_t;
 
-
 void init_op_struct(ir_op_t* op);
 
-
 void init_method_struct(ir_method_t* method);
diff --git a/source/operator/op_name.h b/source/operator/op_name.h
index f90c431c6..068045dd2 100644
--- a/source/operator/op_name.h
+++ b/source/operator/op_name.h
@@ -24,105 +24,105 @@
 
 #pragma once
 
-#define OP_GENERIC_NAME                             "Generic"
-#define OP_ABSVAL_NAME                              "Absval"
-#define OP_ADD_N_NAME                               "Add_n"
-#define OP_ARGMAX_NAME                              "ArgMax"
-#define OP_ARGMIN_NAME                              "ArgMin"
-#define OP_BATCHNORM_NAME                           "BatchNormalize"
-#define OP_BATCHTOSPACEND_NAME                      "Batchtospacend"
-#define OP_BIAS_NAME                                "Bias"
-#define OP_BROADMUL_NAME                            "BroadMul"
-#define OP_CAST_NAME                                "Cast"
-#define OP_CEIL_NAME                                "Ceil"
-#define OP_CLIP_NAME                                "Clip"
-#define OP_COMPARISON_NAME                          "Comparison"
-#define OP_CONCAT_NAME                              "Concat"
-#define OP_CONV_NAME                                "Convolution"
-#define OP_CONST_NAME                               "Const"
-#define OP_CROP_NAME                                "Crop"
-#define OP_DECONV_NAME                              "Deconvolution"
-#define OP_DEPTHTOSPACE_NAME                        "Depthtospace"
-#define OP_DETECTION_OUTPUT_NAME                    "DetectionOutput"
-#define OP_DETECTION_POSTPROCESS_NAME               "DetectionPostProcess"
-#define OP_DROPOUT_NAME                             "Dropout"
-#define OP_ELTWISE_NAME                             "Eltwise"
-#define OP_ELU_NAME                                 "Elu"
-#define OP_EMBEDDING_NAME                           "Embedding"
-#define OP_EXPANDDIMS_NAME                          "Expanddims"
-#define OP_FC_NAME                                  "FullyConnected"
-#define OP_FLATTEN_NAME                             "Flatten"
-#define OP_GATHER_NAME                              "Gather"
-#define OP_GEMM_NAME                                "Gemm"
-#define OP_GRU_NAME                                 "Gru"
-#define OP_HARDSIGMOID_NAME                         "HardSigmoid"
-#define OP_HARDSWISH_NAME                           "Hardswish"
-#define OP_INPUT_NAME                               "InputOp"
-#define OP_INSTANCENORM_NAME                        "InstanceNorm"
-#define OP_INTERP_NAME                              "Interp"
-#define OP_LOGICAL_NAME                             "Logical"
-#define OP_LOGISTIC_NAME                            "Logistic"
-#define OP_LRN_NAME                                 "Lrn"
-#define OP_LSTM_NAME                                "Lstm"
-#define OP_MATMUL_NAME                              "Matmul"
-#define OP_MAXIMUM_NAME                             "Maximum"
-#define OP_MEAN_NAME                                "Mean"
-#define OP_MINIMUM_NAME                             "Minimum"
-#define OP_MVN_NAME                                 "Mvn"
-#define OP_NOOP_NAME                                "Noop"
-#define OP_NORMALIZE_NAME                           "Normalize"
-#define OP_PAD_NAME                                 "Pad"
-#define OP_PERMUTE_NAME                             "Permute"
-#define OP_POOL_NAME                                "Pooling"
-#define OP_PRELU_NAME                               "PReLU"
-#define OP_PRIORBOX_NAME                            "PriorBox"
-#define OP_PSROIPOOLING_NAME                        "Psroipooling"
-#define OP_REDUCEL2_NAME                            "ReduceL2"
-#define OP_REDUCTION_NAME                           "Reduction"
-#define OP_REGION_NAME                              "Region"
-#define OP_RELU_NAME                                "ReLU"
-#define OP_RELU6_NAME                               "ReLU6"
-#define OP_REORG_NAME                               "Reorg"
-#define OP_RESHAPE_NAME                             "Reshape"
-#define OP_RESIZE_NAME                              "Resize"
-#define OP_REVERSE_NAME                             "Reverse"
-#define OP_RNN_NAME                                 "RNN"
-#define OP_ROIALIGN_NAME                            "Roialign"
-#define OP_ROIPOOLING_NAME                          "RoiPooling"
-#define OP_ROUND_NAME                               "Round"
-#define OP_RPN_NAME                                 "Rpn"
-#define OP_SCALE_NAME                               "Scale"
-#define OP_SELU_NAME                                "Selu"
-#define OP_SHUFFLECHANNEL_NAME                      "ShuffleChannel"
-#define OP_SIGMOID_NAME                             "Sigmoid"
-#define OP_SLICE_NAME                               "Slice"
-#define OP_SOFTMAX_NAME                             "Softmax"
-#define OP_SPACETOBATCHND_NAME                      "Spacetobatchnd"
-#define OP_SPACETODEPTH_NAME                        "Spacetodepth"
-#define OP_SPARSETODENSE_NAME                       "SparseToDense"
-#define OP_SPLIT_NAME                               "Split"
-#define OP_SQUAREDDIFFERENCE_NAME                   "SquaredDifference"
-#define OP_SQUEEZE_NAME                             "Squeeze"
-#define OP_STRIDEDSLICE_NAME                        "StridedSlice"
-#define OP_SWAP_AXIS_NAME                           "SwapAxis"
-#define OP_TANH_NAME                                "Tanh"
-#define OP_THRESHOLD_NAME                           "Threshold"
-#define OP_TOPKV2_NAME                              "Topkv2"
-#define OP_TRANSPOSE_NAME                           "Transpose"
-#define OP_UNARY_NAME                               "Unary"
-#define OP_UNSQUEEZE_NAME                           "Unsqueeze"
-#define OP_UPSAMPLE_NAME                            "Upsample"
-#define OP_ZEROSLIKE_NAME                           "ZerosLike"
-#define OP_MISH_NAME                                "Mish"
-#define OP_LOGSOFTMAX_NAME                          "LogSoftmax"
-#define OP_RELU1_NAME                               "ReLU1"
-#define OP_L2NORMALIZATION_NAME                     "L2Normalization"
-#define OP_L2POOL_NAME                              "L2Pool"
-#define OP_TILE_NAME                                "Tile"
-#define OP_SHAPE_NAME                               "Shape"
-#define OP_SCATTER_NAME                             "Scatter"
-#define OP_WHERE_NAME                               "Where"
-#define OP_SOFTPLUS_NAME                            "Softplus"
-#define OP_RECIPROCAL_NAME                          "Reciprocal"
-#define OP_SPATIALTRANSFORMER_NAME                  "SpatialTransformer"
-#define OP_EXPAND_NAME                              "Expand"
+#define OP_GENERIC_NAME               "Generic"
+#define OP_ABSVAL_NAME                "Absval"
+#define OP_ADD_N_NAME                 "Add_n"
+#define OP_ARGMAX_NAME                "ArgMax"
+#define OP_ARGMIN_NAME                "ArgMin"
+#define OP_BATCHNORM_NAME             "BatchNormalize"
+#define OP_BATCHTOSPACEND_NAME        "Batchtospacend"
+#define OP_BIAS_NAME                  "Bias"
+#define OP_BROADMUL_NAME              "BroadMul"
+#define OP_CAST_NAME                  "Cast"
+#define OP_CEIL_NAME                  "Ceil"
+#define OP_CLIP_NAME                  "Clip"
+#define OP_COMPARISON_NAME            "Comparison"
+#define OP_CONCAT_NAME                "Concat"
+#define OP_CONV_NAME                  "Convolution"
+#define OP_CONST_NAME                 "Const"
+#define OP_CROP_NAME                  "Crop"
+#define OP_DECONV_NAME                "Deconvolution"
+#define OP_DEPTHTOSPACE_NAME          "Depthtospace"
+#define OP_DETECTION_OUTPUT_NAME      "DetectionOutput"
+#define OP_DETECTION_POSTPROCESS_NAME "DetectionPostProcess"
+#define OP_DROPOUT_NAME               "Dropout"
+#define OP_ELTWISE_NAME               "Eltwise"
+#define OP_ELU_NAME                   "Elu"
+#define OP_EMBEDDING_NAME             "Embedding"
+#define OP_EXPANDDIMS_NAME            "Expanddims"
+#define OP_FC_NAME                    "FullyConnected"
+#define OP_FLATTEN_NAME               "Flatten"
+#define OP_GATHER_NAME                "Gather"
+#define OP_GEMM_NAME                  "Gemm"
+#define OP_GRU_NAME                   "Gru"
+#define OP_HARDSIGMOID_NAME           "HardSigmoid"
+#define OP_HARDSWISH_NAME             "Hardswish"
+#define OP_INPUT_NAME                 "InputOp"
+#define OP_INSTANCENORM_NAME          "InstanceNorm"
+#define OP_INTERP_NAME                "Interp"
+#define OP_LOGICAL_NAME               "Logical"
+#define OP_LOGISTIC_NAME              "Logistic"
+#define OP_LRN_NAME                   "Lrn"
+#define OP_LSTM_NAME                  "Lstm"
+#define OP_MATMUL_NAME                "Matmul"
+#define OP_MAXIMUM_NAME               "Maximum"
+#define OP_MEAN_NAME                  "Mean"
+#define OP_MINIMUM_NAME               "Minimum"
+#define OP_MVN_NAME                   "Mvn"
+#define OP_NOOP_NAME                  "Noop"
+#define OP_NORMALIZE_NAME             "Normalize"
+#define OP_PAD_NAME                   "Pad"
+#define OP_PERMUTE_NAME               "Permute"
+#define OP_POOL_NAME                  "Pooling"
+#define OP_PRELU_NAME                 "PReLU"
+#define OP_PRIORBOX_NAME              "PriorBox"
+#define OP_PSROIPOOLING_NAME          "Psroipooling"
+#define OP_REDUCEL2_NAME              "ReduceL2"
+#define OP_REDUCTION_NAME             "Reduction"
+#define OP_REGION_NAME                "Region"
+#define OP_RELU_NAME                  "ReLU"
+#define OP_RELU6_NAME                 "ReLU6"
+#define OP_REORG_NAME                 "Reorg"
+#define OP_RESHAPE_NAME               "Reshape"
+#define OP_RESIZE_NAME                "Resize"
+#define OP_REVERSE_NAME               "Reverse"
+#define OP_RNN_NAME                   "RNN"
+#define OP_ROIALIGN_NAME              "Roialign"
+#define OP_ROIPOOLING_NAME            "RoiPooling"
+#define OP_ROUND_NAME                 "Round"
+#define OP_RPN_NAME                   "Rpn"
+#define OP_SCALE_NAME                 "Scale"
+#define OP_SELU_NAME                  "Selu"
+#define OP_SHUFFLECHANNEL_NAME        "ShuffleChannel"
+#define OP_SIGMOID_NAME               "Sigmoid"
+#define OP_SLICE_NAME                 "Slice"
+#define OP_SOFTMAX_NAME               "Softmax"
+#define OP_SPACETOBATCHND_NAME        "Spacetobatchnd"
+#define OP_SPACETODEPTH_NAME          "Spacetodepth"
+#define OP_SPARSETODENSE_NAME         "SparseToDense"
+#define OP_SPLIT_NAME                 "Split"
+#define OP_SQUAREDDIFFERENCE_NAME     "SquaredDifference"
+#define OP_SQUEEZE_NAME               "Squeeze"
+#define OP_STRIDEDSLICE_NAME          "StridedSlice"
+#define OP_SWAP_AXIS_NAME             "SwapAxis"
+#define OP_TANH_NAME                  "Tanh"
+#define OP_THRESHOLD_NAME             "Threshold"
+#define OP_TOPKV2_NAME                "Topkv2"
+#define OP_TRANSPOSE_NAME             "Transpose"
+#define OP_UNARY_NAME                 "Unary"
+#define OP_UNSQUEEZE_NAME             "Unsqueeze"
+#define OP_UPSAMPLE_NAME              "Upsample"
+#define OP_ZEROSLIKE_NAME             "ZerosLike"
+#define OP_MISH_NAME                  "Mish"
+#define OP_LOGSOFTMAX_NAME            "LogSoftmax"
+#define OP_RELU1_NAME                 "ReLU1"
+#define OP_L2NORMALIZATION_NAME       "L2Normalization"
+#define OP_L2POOL_NAME                "L2Pool"
+#define OP_TILE_NAME                  "Tile"
+#define OP_SHAPE_NAME                 "Shape"
+#define OP_SCATTER_NAME               "Scatter"
+#define OP_WHERE_NAME                 "Where"
+#define OP_SOFTPLUS_NAME              "Softplus"
+#define OP_RECIPROCAL_NAME            "Reciprocal"
+#define OP_SPATIALTRANSFORMER_NAME    "SpatialTransformer"
+#define OP_EXPAND_NAME                "Expand"
diff --git a/source/operator/prototype/absval.c b/source/operator/prototype/absval.c
index bd3367f01..2565ecfbf 100644
--- a/source/operator/prototype/absval.c
+++ b/source/operator/prototype/absval.c
@@ -28,7 +28,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -40,7 +39,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->param_mem = NULL;
@@ -51,13 +49,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_absval_op()
 {
     struct method m;
@@ -66,11 +62,9 @@ int register_absval_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_ABSVAL, OP_ABSVAL_NAME, &m);
 }
 
-
 int unregister_absval_op()
 {
     return unregister_op(OP_ABSVAL, 1);
diff --git a/source/operator/prototype/add_n.c b/source/operator/prototype/add_n.c
index 6b6d48ad5..a33ae1f28 100644
--- a/source/operator/prototype/add_n.c
+++ b/source/operator/prototype/add_n.c
@@ -29,7 +29,6 @@
 
 #include <stddef.h>
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 1;
@@ -37,7 +36,6 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 int register_add_n_op()
 {
     struct method m;
@@ -45,11 +43,9 @@ int register_add_n_op()
     m.init = init_op;
     m.release = NULL;
 
-
     return register_op(OP_ADD_N, OP_ADD_N_NAME, &m);
 }
 
-
 int unregister_add_n_op()
 {
     return unregister_op(OP_ADD_N, 1);
diff --git a/source/operator/prototype/argmax.c b/source/operator/prototype/argmax.c
index e5eabc44d..25733f6f9 100644
--- a/source/operator/prototype/argmax.c
+++ b/source/operator/prototype/argmax.c
@@ -30,14 +30,13 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct argmax_param* argmax_param = ( struct argmax_param* )(node->op.param_mem);
+    struct argmax_param* argmax_param = (struct argmax_param*)(node->op.param_mem);
 
     int axis = argmax_param->axis;
 
@@ -55,7 +54,7 @@ static int infer_shape(struct node* node)
     input->dims[0] = tmp;
     input->dims[3] = 1;
 
-    if (input->dims[0] != 1)    // input 3 keepdimss
+    if (input->dims[0] != 1) // input 3 keepdimss
     {
         for (int i = 0, j = 0; i < 3; i++)
         {
@@ -63,7 +62,7 @@ static int infer_shape(struct node* node)
                 outdims[j++] = input->dims[i];
         }
     }
-    else    // input 2 keepdimss
+    else // input 2 keepdimss
     {
         for (int i = 0, j = 0; i < 4; i++)
             outdims[j++] = input->dims[i];
@@ -85,7 +84,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     struct argmax_param* argmax_param = (struct argmax_param*)sys_malloc(sizeof(struct argmax_param));
@@ -107,13 +105,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_argmax_op()
 {
     struct method m;
@@ -124,7 +120,6 @@ int register_argmax_op()
     return register_op(OP_ARGMAX, OP_ARGMAX_NAME, &m);
 }
 
-
 int unregister_argmax_op()
 {
     return unregister_op(OP_ARGMAX, 1);
diff --git a/source/operator/prototype/argmin.c b/source/operator/prototype/argmin.c
index 670415fe0..bf8844c93 100644
--- a/source/operator/prototype/argmin.c
+++ b/source/operator/prototype/argmin.c
@@ -31,14 +31,13 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct argmin_param* argmin_param = ( struct argmin_param* )(node->op.param_mem);
+    struct argmin_param* argmin_param = (struct argmin_param*)(node->op.param_mem);
 
     int axis = argmin_param->axis;
 
@@ -56,7 +55,7 @@ static int infer_shape(struct node* node)
     input->dims[0] = tmp;
     input->dims[3] = 1;
 
-    if (input->dims[0] != 1)    // input 3 keepdimss
+    if (input->dims[0] != 1) // input 3 keepdimss
     {
         for (int i = 0, j = 0; i < 3; i++)
         {
@@ -64,7 +63,7 @@ static int infer_shape(struct node* node)
                 outdims[j++] = input->dims[i];
         }
     }
-    else    // input 2 keepdimss
+    else // input 2 keepdimss
     {
         for (int i = 0, j = 0; i < 4; i++)
             outdims[j++] = input->dims[i];
@@ -86,10 +85,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct argmin_param* argmin_param = ( struct argmin_param* )sys_malloc(sizeof(struct argmin_param));
+    struct argmin_param* argmin_param = (struct argmin_param*)sys_malloc(sizeof(struct argmin_param));
 
     if (argmin_param == NULL)
     {
@@ -108,13 +106,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_argmin_op()
 {
     struct method m;
@@ -125,7 +121,6 @@ int register_argmin_op()
     return register_op(OP_ARGMIN, OP_ARGMIN_NAME, &m);
 }
 
-
 int unregister_argmin_op()
 {
     return unregister_op(OP_ARGMIN, 1);
diff --git a/source/operator/prototype/batchnorm.c b/source/operator/prototype/batchnorm.c
index f905fed0a..888844873 100644
--- a/source/operator/prototype/batchnorm.c
+++ b/source/operator/prototype/batchnorm.c
@@ -30,7 +30,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -40,10 +39,9 @@ static int infer_shape(struct node* node)
     return set_ir_tensor_shape(output, input->dims, input->dim_num);
 }
 
-
 static int init_op(struct op* op)
 {
-    batchnorm_param_t* batchnorm_param = ( batchnorm_param_t* )sys_malloc(sizeof(batchnorm_param_t));
+    batchnorm_param_t* batchnorm_param = (batchnorm_param_t*)sys_malloc(sizeof(batchnorm_param_t));
 
     if (batchnorm_param == NULL)
     {
@@ -62,13 +60,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_batchnorm_op()
 {
     struct method m;
@@ -77,7 +73,6 @@ int register_batchnorm_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_BATCHNORM, OP_BATCHNORM_NAME, &m);
 }
 
diff --git a/source/operator/prototype/batchtospacend.c b/source/operator/prototype/batchtospacend.c
index bced4cb27..e307de16d 100644
--- a/source/operator/prototype/batchtospacend.c
+++ b/source/operator/prototype/batchtospacend.c
@@ -30,22 +30,19 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct batchtospacend_param* batchtospacend_param = ( struct batchtospacend_param* )(node->op.param_mem);
+    struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)(node->op.param_mem);
 
     int out_dim[4];
 
     out_dim[0] = input->dims[0] / (batchtospacend_param->dilation_x * batchtospacend_param->dilation_y);
-    out_dim[1] = input->dims[1] * batchtospacend_param->dilation_y - batchtospacend_param->crop_top -
-                 batchtospacend_param->crop_bottom;
-    out_dim[2] = input->dims[2] * batchtospacend_param->dilation_x - batchtospacend_param->crop_left -
-                 batchtospacend_param->crop_right;
+    out_dim[1] = input->dims[1] * batchtospacend_param->dilation_y - batchtospacend_param->crop_top - batchtospacend_param->crop_bottom;
+    out_dim[2] = input->dims[2] * batchtospacend_param->dilation_x - batchtospacend_param->crop_left - batchtospacend_param->crop_right;
     out_dim[3] = input->dims[3];
 
     set_ir_tensor_shape(output, out_dim, 4);
@@ -53,11 +50,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct batchtospacend_param* batchtospacend_param =
-        ( struct batchtospacend_param* )sys_malloc(sizeof(struct batchtospacend_param));
+    struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)sys_malloc(sizeof(struct batchtospacend_param));
 
     if (batchtospacend_param == NULL)
     {
@@ -80,13 +75,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_batchtospacend_op()
 {
     struct method m;
@@ -95,11 +88,9 @@ int register_batchtospacend_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_BATCHTOSPACEND, OP_BATCHTOSPACEND_NAME, &m);
 }
 
-
 int unregister_batchtospacend_op()
 {
     return unregister_op(OP_BATCHTOSPACEND, 1);
diff --git a/source/operator/prototype/bias.c b/source/operator/prototype/bias.c
index f2b9ed6e2..f2b0b01b9 100644
--- a/source/operator/prototype/bias.c
+++ b/source/operator/prototype/bias.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
     op->same_shape = 0;
@@ -48,9 +46,9 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
-static void release_op(ir_op_t* op) {}
-
+static void release_op(ir_op_t* op)
+{
+}
 
 int register_bias_op()
 {
@@ -63,7 +61,6 @@ int register_bias_op()
     return register_op(OP_BIAS, OP_BIAS_NAME, &m);
 }
 
-
 int unregister_bias_op()
 {
     return unregister_op(OP_BIAS, 1);
diff --git a/source/operator/prototype/broadmul.c b/source/operator/prototype/broadmul.c
index 26012172e..42272e19b 100644
--- a/source/operator/prototype/broadmul.c
+++ b/source/operator/prototype/broadmul.c
@@ -28,7 +28,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -40,7 +39,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     /*set the param default value */
@@ -52,9 +50,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_broadmul_op()
 {
@@ -67,7 +65,6 @@ int register_broadmul_op()
     return register_op(OP_BROADMUL, OP_BROADMUL_NAME, &m);
 }
 
-
 int unregister_broadmul_op()
 {
     return unregister_op(OP_BROADMUL, 1);
diff --git a/source/operator/prototype/cast.c b/source/operator/prototype/cast.c
index 87b7311f4..87b480440 100644
--- a/source/operator/prototype/cast.c
+++ b/source/operator/prototype/cast.c
@@ -29,7 +29,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int init_op(ir_op_t* op)
 {
     struct cast_param* cast_param = (struct cast_param*)sys_malloc(sizeof(struct cast_param));
@@ -50,13 +49,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_cast_op()
 {
     ir_method_t m;
@@ -68,7 +65,6 @@ int register_cast_op()
     return register_op(OP_CAST, OP_CAST_NAME, &m);
 }
 
-
 int unregister_cast_op()
 {
     return unregister_op(OP_CAST, 1);
diff --git a/source/operator/prototype/ceil.c b/source/operator/prototype/ceil.c
index 2a894afd9..f28c58416 100644
--- a/source/operator/prototype/ceil.c
+++ b/source/operator/prototype/ceil.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -48,8 +46,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_ceil_op()
 {
@@ -62,7 +61,6 @@ int register_ceil_op()
     return register_op(OP_CEIL, OP_CEIL_NAME, &m);
 }
 
-
 int unregister_ceil_op()
 {
     return unregister_op(OP_CEIL, 1);
diff --git a/source/operator/prototype/clip.c b/source/operator/prototype/clip.c
index cbcbc94a7..663f0ff7e 100644
--- a/source/operator/prototype/clip.c
+++ b/source/operator/prototype/clip.c
@@ -32,7 +32,6 @@
 
 #include "float.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -44,9 +43,9 @@ static int infer_shape(struct node* node)
         struct tensor* clip_min = get_ir_graph_tensor(ir_graph, node->input_tensors[1]);
         struct tensor* clip_max = get_ir_graph_tensor(ir_graph, node->input_tensors[2]);
 
-        struct clip_param* clip_param = ( struct clip_param* )node->op.param_mem;
-        float* min = (float *)clip_min->data;
-        float* max = (float *)clip_max->data;
+        struct clip_param* clip_param = (struct clip_param*)node->op.param_mem;
+        float* min = (float*)clip_min->data;
+        float* max = (float*)clip_max->data;
         clip_param->min = min[0];
         clip_param->max = max[0];
     }
@@ -58,7 +57,7 @@ static int infer_shape(struct node* node)
 
 static int init_op(struct op* op)
 {
-    struct clip_param* clip_param = ( struct clip_param* )sys_malloc(sizeof(struct clip_param));
+    struct clip_param* clip_param = (struct clip_param*)sys_malloc(sizeof(struct clip_param));
 
     if (clip_param == NULL)
     {
@@ -77,13 +76,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_clip_op()
 {
     struct method m;
@@ -95,7 +92,6 @@ int register_clip_op()
     return register_op(OP_CLIP, OP_CLIP_NAME, &m);
 }
 
-
 int unregister_clip_op()
 {
     return unregister_op(OP_CLIP, 1);
diff --git a/source/operator/prototype/comparison.c b/source/operator/prototype/comparison.c
index 4fc33ab5b..8468c7e3d 100644
--- a/source/operator/prototype/comparison.c
+++ b/source/operator/prototype/comparison.c
@@ -33,7 +33,6 @@
 
 #include <string.h>
 
-
 #define CALC_TENSOR_SHAPE_SIZE(outval, IR_TENSOR)       \
     {                                                   \
         outval = 1;                                     \
@@ -43,7 +42,6 @@
         }                                               \
     }
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -75,10 +73,9 @@ static int infer_shape(struct node* node)
     }
 }
 
-
 static int init_op(struct op* op)
 {
-    struct comparison_param* param = ( struct comparison_param* )sys_malloc(sizeof(struct comparison_param));
+    struct comparison_param* param = (struct comparison_param*)sys_malloc(sizeof(struct comparison_param));
 
     if (param == NULL)
     {
@@ -95,13 +92,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_comparison_op()
 {
     struct method m;
@@ -110,11 +105,9 @@ int register_comparison_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_COMPARISON, OP_COMPARISON_NAME, &m);
 }
 
-
 int unregister_comparison_op()
 {
     return unregister_op(OP_COMPARISON, 1);
diff --git a/source/operator/prototype/concat.c b/source/operator/prototype/concat.c
index 7d8c802b8..478cd797d 100644
--- a/source/operator/prototype/concat.c
+++ b/source/operator/prototype/concat.c
@@ -31,13 +31,12 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* graph = node->graph;
     ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct concat_param* concat_param = ( struct concat_param* )(node->op.param_mem);
+    struct concat_param* concat_param = (struct concat_param*)(node->op.param_mem);
 
     int concat_shape = 0;
     int axis = concat_param->axis;
@@ -104,10 +103,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct concat_param* concat_param = ( struct concat_param* )sys_malloc(sizeof(struct concat_param));
+    struct concat_param* concat_param = (struct concat_param*)sys_malloc(sizeof(struct concat_param));
 
     if (concat_param == NULL)
     {
@@ -125,13 +123,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_concat_op()
 {
     ir_method_t m;
@@ -143,7 +139,6 @@ int register_concat_op()
     return register_op(OP_CONCAT, OP_CONCAT_NAME, &m);
 }
 
-
 int unregister_concat_op()
 {
     return unregister_op(OP_CONCAT, 1);
diff --git a/source/operator/prototype/const.c b/source/operator/prototype/const.c
index e4d5c8bd7..56d4fd203 100644
--- a/source/operator/prototype/const.c
+++ b/source/operator/prototype/const.c
@@ -27,15 +27,14 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int init_op(ir_op_t* op)
 {
     return 0;
 }
 
-
-static void release_op(ir_op_t* op) {}
-
+static void release_op(ir_op_t* op)
+{
+}
 
 int register_const_op()
 {
@@ -45,10 +44,9 @@ int register_const_op()
     m.init = init_op;
     m.release = release_op;
 
-    return register_op(OP_CONST, OP_CONST_NAME , &m);
+    return register_op(OP_CONST, OP_CONST_NAME, &m);
 }
 
-
 int unregister_const_op()
 {
     return unregister_op(OP_CONST, 1);
diff --git a/source/operator/prototype/convolution.c b/source/operator/prototype/convolution.c
index 5bada8581..9ae31d787 100644
--- a/source/operator/prototype/convolution.c
+++ b/source/operator/prototype/convolution.c
@@ -32,14 +32,13 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* graph = node->graph;
     ir_tensor_t* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct conv_param* conv_param = ( struct conv_param* )(node->op.param_mem);
+    struct conv_param* conv_param = (struct conv_param*)(node->op.param_mem);
 
     int n = input->dims[0];
     int h, w;
@@ -97,8 +96,7 @@ static int infer_shape(ir_node_t* node)
     }
     else
     {
-        out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) /
-                conv_param->stride_h + 1;
+        out_h = (h - conv_param->dilation_h * (conv_param->kernel_h - 1) - 1 + conv_param->pad_h0 + conv_param->pad_h1) / conv_param->stride_h + 1;
     }
 
     if (conv_param->pad_w0 < 0)
@@ -122,8 +120,7 @@ static int infer_shape(ir_node_t* node)
     }
     else
     {
-        out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) /
-                conv_param->stride_w + 1;
+        out_w = (w - conv_param->dilation_w * (conv_param->kernel_w - 1) - 1 + conv_param->pad_w0 + conv_param->pad_w1) / conv_param->stride_w + 1;
     }
 
     int dims[4];
@@ -133,7 +130,7 @@ static int infer_shape(ir_node_t* node)
     dims[2] = out_h;
     dims[3] = out_w;
 
-    for (int i=0; i<4; i++)
+    for (int i = 0; i < 4; i++)
     {
         if (dims[i] == 0)
         {
@@ -146,10 +143,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct conv_param* conv_param = ( struct conv_param* )sys_malloc(sizeof(struct conv_param));
+    struct conv_param* conv_param = (struct conv_param*)sys_malloc(sizeof(struct conv_param));
 
     if (conv_param == NULL)
     {
@@ -180,13 +176,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_convolution_op()
 {
     ir_method_t m;
@@ -198,7 +192,6 @@ int register_convolution_op()
     return register_op(OP_CONV, OP_CONV_NAME, &m);
 }
 
-
 int unregister_convolution_op()
 {
     return unregister_op(OP_CONV, 1);
diff --git a/source/operator/prototype/convolution_param.h b/source/operator/prototype/convolution_param.h
index 22fb7e34c..602e6d135 100644
--- a/source/operator/prototype/convolution_param.h
+++ b/source/operator/prototype/convolution_param.h
@@ -46,22 +46,22 @@ struct conv_param
 
 struct conv_priv_info
 {
-    void* interleave_buffer;    // kernel transform buffer
-    void* interleave_buffer_pack4;    // kernel pack4
-    void* im2col_buffer;    // input data transform buffer
-    void* im2col_buffer_pack4;    // input data transform buffer pack4
+    void* interleave_buffer;       // kernel transform buffer
+    void* interleave_buffer_pack4; // kernel pack4
+    void* im2col_buffer;           // input data transform buffer
+    void* im2col_buffer_pack4;     // input data transform buffer pack4
     void* input_pad;
     void* dot_block;
     void* transform_input;
     void* output_bordered;
-    int im2col_buffer_size;    // kernel transform buffer size
-    int im2col_buffer_pack4_size;    // kernel transform buffer size
-    int interleave_buffer_size;    // input data transform buffer size
+    int im2col_buffer_size;       // kernel transform buffer size
+    int im2col_buffer_pack4_size; // kernel transform buffer size
+    int interleave_buffer_size;   // input data transform buffer size
     int interleave_buffer_pack4_size;
-    int external_im2col_mem;    // flag
-    int external_im2col_pack4_mem;    // flag
-    int external_interleave_mem;    // flag
-    int external_interleave_pack4_mem;    // flag
+    int external_im2col_mem;           // flag
+    int external_im2col_pack4_mem;     // flag
+    int external_interleave_mem;       // flag
+    int external_interleave_pack4_mem; // flag
     int cpu_type;
     int winograd;
     int wino_off;
diff --git a/source/operator/prototype/crop.c b/source/operator/prototype/crop.c
index 4cab29e09..0a1fb106f 100644
--- a/source/operator/prototype/crop.c
+++ b/source/operator/prototype/crop.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[1]); // Don't try to modify !
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    struct crop_param* crop_param = ( struct crop_param* )(node->op.param_mem);
+    struct crop_param* crop_param = (struct crop_param*)(node->op.param_mem);
 
     int input_h = input->dims[2];
     int input_w = input->dims[3];
@@ -78,10 +77,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct crop_param* crop_param = ( struct crop_param* )sys_malloc(sizeof(struct crop_param));
+    struct crop_param* crop_param = (struct crop_param*)sys_malloc(sizeof(struct crop_param));
 
     if (crop_param == NULL)
     {
@@ -107,13 +105,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_crop_op()
 {
     struct method m;
@@ -122,11 +118,9 @@ int register_crop_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_CROP, OP_CROP_NAME, &m);
 }
 
-
 int unregister_crop_op()
 {
     return unregister_op(OP_CROP, 1);
diff --git a/source/operator/prototype/deconvolution.c b/source/operator/prototype/deconvolution.c
index 3257a1e74..a030ee506 100644
--- a/source/operator/prototype/deconvolution.c
+++ b/source/operator/prototype/deconvolution.c
@@ -32,14 +32,13 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct deconv_param* deconv_param = ( struct deconv_param* )(node->op.param_mem);
+    struct deconv_param* deconv_param = (struct deconv_param*)(node->op.param_mem);
 
     int n = input->dims[0];
     int h, w;
@@ -99,7 +98,7 @@ static int infer_shape(struct node* node)
 
 static int init_op(struct op* op)
 {
-    struct deconv_param* deconv_param = ( struct deconv_param* )sys_malloc(sizeof(struct deconv_param));
+    struct deconv_param* deconv_param = (struct deconv_param*)sys_malloc(sizeof(struct deconv_param));
 
     if (deconv_param == NULL)
     {
@@ -144,7 +143,6 @@ int register_deconvolution_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_DECONV, OP_DECONV_NAME, &m);
 }
 
diff --git a/source/operator/prototype/depthtospace.c b/source/operator/prototype/depthtospace.c
index 424e42236..3af00fdd1 100644
--- a/source/operator/prototype/depthtospace.c
+++ b/source/operator/prototype/depthtospace.c
@@ -32,34 +32,31 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct depthtospace_param* depthtospace_param = ( struct depthtospace_param* )(node->op.param_mem);
+    struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)(node->op.param_mem);
 
     /* todo reshape */
     int dims[4];
     int block_size = depthtospace_param->block_size;
 
-    dims[0] = input->dims[0];    // batch
-    dims[1] = input->dims[1] / (block_size * block_size);    // channel
-    dims[2] = input->dims[2] * block_size;    // height
-    dims[3] = input->dims[3] * block_size;    // width
+    dims[0] = input->dims[0];                             // batch
+    dims[1] = input->dims[1] / (block_size * block_size); // channel
+    dims[2] = input->dims[2] * block_size;                // height
+    dims[3] = input->dims[3] * block_size;                // width
 
     set_ir_tensor_shape(output, dims, 4);
 
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct depthtospace_param* depthtospace_param =
-        ( struct depthtospace_param* )sys_malloc(sizeof(struct depthtospace_param));
+    struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)sys_malloc(sizeof(struct depthtospace_param));
 
     if (depthtospace_param == NULL)
     {
@@ -77,13 +74,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_depthtospace_op()
 {
     struct method m;
@@ -92,11 +87,9 @@ int register_depthtospace_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_DEPTHTOSPACE, OP_DEPTHTOSPACE_NAME, &m);
 }
 
-
 int unregister_depthtospace_op()
 {
     return unregister_op(OP_DEPTHTOSPACE, 1);
diff --git a/source/operator/prototype/detection_output.c b/source/operator/prototype/detection_output.c
index 05a02a5c2..cc49e3028 100644
--- a/source/operator/prototype/detection_output.c
+++ b/source/operator/prototype/detection_output.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct detection_output_param* param = ( struct detection_output_param* )node->op.param_mem;
+    struct detection_output_param* param = (struct detection_output_param*)node->op.param_mem;
 
     int dims[TE_MAX_SHAPE_DIM_NUM] = {0};
 
@@ -52,11 +51,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct detection_output_param* detection_output_param =
-        ( struct detection_output_param* )sys_malloc(sizeof(struct detection_output_param));
+    struct detection_output_param* detection_output_param = (struct detection_output_param*)sys_malloc(sizeof(struct detection_output_param));
 
     if (detection_output_param == NULL)
     {
@@ -77,13 +74,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_detection_output_op()
 {
     struct method m;
@@ -92,11 +87,9 @@ int register_detection_output_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_DETECTION_OUTPUT, OP_DETECTION_OUTPUT_NAME, &m);
 }
 
-
 int unregister_detection_output_op()
 {
     return unregister_op(OP_DETECTION_OUTPUT, 1);
diff --git a/source/operator/prototype/detection_postprocess.c b/source/operator/prototype/detection_postprocess.c
index a0aed51fb..35c29cf56 100644
--- a/source/operator/prototype/detection_postprocess.c
+++ b/source/operator/prototype/detection_postprocess.c
@@ -32,7 +32,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -44,8 +43,7 @@ static int infer_shape(struct node* node)
     struct tensor* output2 = get_ir_graph_tensor(ir_graph, node->output_tensors[2]);
     struct tensor* output3 = get_ir_graph_tensor(ir_graph, node->output_tensors[3]);
 
-    struct detection_postprocess_param* detection_postprocess_param =
-        ( struct detection_postprocess_param* )(node->op.param_mem);
+    struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)(node->op.param_mem);
     int max_detections = detection_postprocess_param->max_detections;
     int max_classes_per_detection = detection_postprocess_param->max_classes_per_detection;
     int num_classes = detection_postprocess_param->num_classes;
@@ -54,8 +52,7 @@ static int infer_shape(struct node* node)
     int* in_dim2 = &input1->dims[TE_MAX_SHAPE_DIM_NUM];
 
     // Only support: batch_size == 1 && num_coord == 4
-    if (input0->dims[0] != 1 || input0->dims[1] != 4 || input1->dims[0] != 1 || input1->dims[2] != input0->dims[2] ||
-        input1->dims[1] != num_classes + 1)
+    if (input0->dims[0] != 1 || input0->dims[1] != 4 || input1->dims[0] != 1 || input1->dims[2] != input0->dims[2] || input1->dims[1] != num_classes + 1)
     {
         TLOG_ERR("Not Support.\n");
         return -1;
@@ -73,11 +70,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct detection_postprocess_param* detection_postprocess_param =
-        ( struct detection_postprocess_param* )sys_malloc(sizeof(struct detection_postprocess_param));
+    struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)sys_malloc(sizeof(struct detection_postprocess_param));
 
     if (detection_postprocess_param == NULL)
     {
@@ -94,11 +89,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
-    struct detection_postprocess_param* detection_postprocess_param =
-        ( struct detection_postprocess_param* )op->param_mem;
+    struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)op->param_mem;
 
     if (detection_postprocess_param->scales)
         sys_free(detection_postprocess_param->scales);
@@ -106,7 +99,6 @@ static void release_op(struct op* op)
     sys_free(op->param_mem);
 }
 
-
 int register_detection_postprocess_op()
 {
     struct method m;
@@ -118,7 +110,6 @@ int register_detection_postprocess_op()
     return register_op(OP_DETECTION_POSTPROCESS, OP_DETECTION_POSTPROCESS_NAME, &m);
 }
 
-
 int unregister_detection_postprocess_op()
 {
     return unregister_op(OP_DETECTION_POSTPROCESS, 1);
diff --git a/source/operator/prototype/detection_postprocess_param.h b/source/operator/prototype/detection_postprocess_param.h
index 77a751071..3c53cc022 100644
--- a/source/operator/prototype/detection_postprocess_param.h
+++ b/source/operator/prototype/detection_postprocess_param.h
@@ -32,7 +32,7 @@ struct detection_postprocess_param
     float nms_score_threshold;
     float nms_iou_threshold;
     int num_classes;
-    float* scales;    // y_scale, x_scale, h_scale, w_scale
+    float* scales; // y_scale, x_scale, h_scale, w_scale
 };
 
 #endif
diff --git a/source/operator/prototype/dropout.c b/source/operator/prototype/dropout.c
index 348a8666a..4fda96805 100644
--- a/source/operator/prototype/dropout.c
+++ b/source/operator/prototype/dropout.c
@@ -28,7 +28,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 1;
@@ -37,9 +36,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_dropout_op()
 {
@@ -52,7 +51,6 @@ int register_dropout_op()
     return register_op(OP_DROPOUT, OP_DROPOUT_NAME, &m);
 }
 
-
 int unregister_dropout_op()
 {
     return unregister_op(OP_DROPOUT, 1);
diff --git a/source/operator/prototype/eltwise.c b/source/operator/prototype/eltwise.c
index 8266b7c00..4288b8935 100644
--- a/source/operator/prototype/eltwise.c
+++ b/source/operator/prototype/eltwise.c
@@ -34,14 +34,13 @@
 
 #include <string.h>
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input0 = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct eltwise_param* eltwise_param = ( struct eltwise_param* )(node->op.param_mem);
+    struct eltwise_param* eltwise_param = (struct eltwise_param*)(node->op.param_mem);
 
     if (node->input_num == 1)
     {
@@ -77,10 +76,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct eltwise_param* eltwise_param = ( struct eltwise_param* )sys_malloc(sizeof(struct eltwise_param));
+    struct eltwise_param* eltwise_param = (struct eltwise_param*)sys_malloc(sizeof(struct eltwise_param));
 
     if (eltwise_param == NULL)
     {
@@ -98,13 +96,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_eltwise_op()
 {
     struct method m;
@@ -113,11 +109,9 @@ int register_eltwise_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_ELTWISE, OP_ELTWISE_NAME, &m);
 }
 
-
 int unregister_eltwise_op()
 {
     return unregister_op(OP_ELTWISE, 1);
diff --git a/source/operator/prototype/elu.c b/source/operator/prototype/elu.c
index 23ad7d4bd..8f0698983 100644
--- a/source/operator/prototype/elu.c
+++ b/source/operator/prototype/elu.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct elu_param* elu_param = ( struct elu_param* )sys_malloc(sizeof(struct elu_param));
+    struct elu_param* elu_param = (struct elu_param*)sys_malloc(sizeof(struct elu_param));
 
     if (elu_param == NULL)
     {
@@ -64,13 +62,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_elu_op()
 {
     ir_method_t m;
@@ -82,7 +78,6 @@ int register_elu_op()
     return register_op(OP_ELU, OP_ELU_NAME, &m);
 }
 
-
 int unregister_elu_op()
 {
     return unregister_op(OP_ELU, 1);
diff --git a/source/operator/prototype/embedding.c b/source/operator/prototype/embedding.c
index b87b08b89..a8db23069 100644
--- a/source/operator/prototype/embedding.c
+++ b/source/operator/prototype/embedding.c
@@ -32,7 +32,6 @@
 
 #include <string.h>
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -46,7 +45,7 @@ static int infer_shape(struct node* node)
         dims[0] *= input->dims[ii];
     }
 
-    struct embedding_param* param = ( struct embedding_param* )node->op.param_mem;
+    struct embedding_param* param = (struct embedding_param*)node->op.param_mem;
 
     dims[1] = param->num_output;
 
@@ -55,10 +54,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct embedding_param* param = ( struct embedding_param* )sys_malloc(sizeof(struct embedding_param));
+    struct embedding_param* param = (struct embedding_param*)sys_malloc(sizeof(struct embedding_param));
 
     if (param == NULL)
     {
@@ -75,13 +73,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_embedding_op()
 {
     struct method m;
@@ -90,11 +86,9 @@ int register_embedding_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_EMBEDDING, OP_EMBEDDING_NAME, &m);
 }
 
-
 int unregister_embedding_op()
 {
     return unregister_op(OP_EMBEDDING, 1);
diff --git a/source/operator/prototype/embedding_param.h b/source/operator/prototype/embedding_param.h
index 3489e9f46..d2f268375 100644
--- a/source/operator/prototype/embedding_param.h
+++ b/source/operator/prototype/embedding_param.h
@@ -28,7 +28,7 @@ struct embedding_param
 {
     int num_output;
     int input_dim;
-    int bias_term;    // if use bias
+    int bias_term; // if use bias
     int weight_data_size;
 };
 
diff --git a/source/operator/prototype/expand.c b/source/operator/prototype/expand.c
index de22eb949..521c86173 100644
--- a/source/operator/prototype/expand.c
+++ b/source/operator/prototype/expand.c
@@ -40,8 +40,8 @@ static int infer_shape(struct node* node)
     struct vector* dims = create_vector(sizeof(int), NULL);
     struct vector* dims1 = create_vector(sizeof(int), NULL);
     struct vector* dims2 = create_vector(sizeof(int), NULL);
-    
-    expand_param_t* param = ( struct expand_param* )(node->op.param_mem);
+
+    expand_param_t* param = (struct expand_param*)(node->op.param_mem);
 
     struct graph* graph = node->graph;
     struct tensor* input1 = get_ir_graph_tensor(graph, node->input_tensors[0]);
@@ -49,82 +49,86 @@ static int infer_shape(struct node* node)
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
     int flag = 1;
-    int32_t * input2_data = (int32_t*)input2->data;
-    for(int i = 0; i < input2->elem_num; i++)
+    int32_t* input2_data = (int32_t*)input2->data;
+    for (int i = 0; i < input2->elem_num; i++)
     {
-        if(input2_data[i] == 0){
+        if (input2_data[i] == 0)
+        {
             flag = 0;
         }
     }
 
-    if(flag == 1)
+    if (flag == 1)
     {
-        for(int i = 0; i < input2->elem_num; i++)
+        for (int i = 0; i < input2->elem_num; i++)
             param->ex_shape[i] = input2_data[i];
     }
-    
-    for(int i = 0; i < (int)param->dim_num; i++)
+
+    for (int i = 0; i < (int)param->dim_num; i++)
     {
         int temp = param->ex_shape[i];
         push_vector_data(dims2, (void*)&temp);
     }
     int num = get_vector_num(dims2);
 
-
     int input1_dim_size = input1->dim_num;
     int input2_dim_size = param->dim_num;
-    
-    if(input1_dim_size == input2_dim_size)
+
+    if (input1_dim_size == input2_dim_size)
     {
-        for(int i = 0; i < input2_dim_size; i++)
+        for (int i = 0; i < input2_dim_size; i++)
         {
-            if(input1->dims[i] >= param->ex_shape[i])
+            if (input1->dims[i] >= param->ex_shape[i])
             {
                 int temp = input1->dims[i];
                 push_vector_data(dims, (void*)&temp);
-            } 
+            }
             else
             {
                 int temp = param->ex_shape[i];
                 push_vector_data(dims, (void*)&temp);
             }
         }
-    } else {
+    }
+    else
+    {
         int diff = fabs(input1_dim_size - input2_dim_size);
-        if(input1_dim_size > input2_dim_size)
+        if (input1_dim_size > input2_dim_size)
         {
-            for(int i = 0; i < input1_dim_size; i++)
+            for (int i = 0; i < input1_dim_size; i++)
             {
                 int temp = input1->dims[i];
                 push_vector_data(dims, (void*)&temp);
             }
-            for(int i = 0; i < input1_dim_size - diff; i++)
+            for (int i = 0; i < input1_dim_size - diff; i++)
             {
-                if(input1->dims[i+diff] > param->ex_shape[i])
+                if (input1->dims[i + diff] > param->ex_shape[i])
                 {
-                    int temp = input1->dims[i+diff];
+                    int temp = input1->dims[i + diff];
                     push_vector_data(dims, (void*)&temp);
-                } 
-                else 
+                }
+                else
                 {
                     int temp = param->ex_shape[i];
                     push_vector_data(dims, (void*)&temp);
                 }
             }
-        } else {
-            for(int i = 0; i < input2_dim_size; i++)
+        }
+        else
+        {
+            for (int i = 0; i < input2_dim_size; i++)
             {
                 int temp = param->ex_shape[i];
                 push_vector_data(dims, (void*)&temp);
             }
-            for(int i = 0; i < input2_dim_size - diff; i++)
+            for (int i = 0; i < input2_dim_size - diff; i++)
             {
-                if(param->ex_shape[i+diff] > input1->dims[i])
+                if (param->ex_shape[i + diff] > input1->dims[i])
                 {
-                    int temp = param->ex_shape[i+diff];
+                    int temp = param->ex_shape[i + diff];
                     push_vector_data(dims, (void*)&temp);
-                } 
-                else 
+                }
+                else
                 {
                     int temp = input1->dims[i];
                     push_vector_data(dims, (void*)&temp);
@@ -133,8 +137,8 @@ static int infer_shape(struct node* node)
         }
     }
     int new_size = 1;
-    int* new_shape_temp = (int*)sys_malloc(get_vector_num(dims)*sizeof(int));
-    for(int i = 0; i < get_vector_num(dims); i++)
+    int* new_shape_temp = (int*)sys_malloc(get_vector_num(dims) * sizeof(int));
+    for (int i = 0; i < get_vector_num(dims); i++)
     {
         int* a = (int*)get_vector_data(dims, i);
         new_shape_temp[i] = *a;
@@ -150,10 +154,9 @@ static int infer_shape(struct node* node)
     return ret;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct expand_param* expand_param = ( struct expand_param* )sys_malloc(sizeof(struct expand_param));
+    struct expand_param* expand_param = (struct expand_param*)sys_malloc(sizeof(struct expand_param));
 
     if (expand_param == NULL)
     {
@@ -170,10 +173,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
-    struct expand_param* expand_param = ( struct expand_param* )op->param_mem;
+    struct expand_param* expand_param = (struct expand_param*)op->param_mem;
 
     if (expand_param->ex_shape)
         sys_free(expand_param->ex_shape);
@@ -181,7 +183,6 @@ static void release_op(struct op* op)
     sys_free(op->param_mem);
 }
 
-
 int register_expand_op()
 {
     struct method m;
@@ -193,7 +194,6 @@ int register_expand_op()
     return register_op(OP_EXPAND, OP_EXPAND_NAME, &m);
 }
 
-
 int unregister_expand_op()
 {
     return unregister_op(OP_EXPAND, 1);
diff --git a/source/operator/prototype/expanddims.c b/source/operator/prototype/expanddims.c
index 7b56952b3..c488aa84c 100644
--- a/source/operator/prototype/expanddims.c
+++ b/source/operator/prototype/expanddims.c
@@ -31,14 +31,13 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct expanddims_param* expanddims_param = ( struct expanddims_param* )(node->op.param_mem);
+    struct expanddims_param* expanddims_param = (struct expanddims_param*)(node->op.param_mem);
 
     int axis = expanddims_param->axis;
     int in_size = input->dim_num;
@@ -66,10 +65,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct expanddims_param* expanddims_param = ( struct expanddims_param* )sys_malloc(sizeof(struct expanddims_param));
+    struct expanddims_param* expanddims_param = (struct expanddims_param*)sys_malloc(sizeof(struct expanddims_param));
 
     if (expanddims_param == NULL)
     {
@@ -87,13 +85,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_expanddims_op()
 {
     struct method m;
@@ -105,7 +101,6 @@ int register_expanddims_op()
     return register_op(OP_EXPANDDIMS, OP_EXPANDDIMS_NAME, &m);
 }
 
-
 int unregister_expanddims_op()
 {
     return unregister_op(OP_EXPANDDIMS, 1);
diff --git a/source/operator/prototype/fc.c b/source/operator/prototype/fc.c
index c96860578..4c4fc3c9f 100644
--- a/source/operator/prototype/fc.c
+++ b/source/operator/prototype/fc.c
@@ -32,7 +32,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* graph = node->graph;
@@ -105,7 +104,7 @@ static int infer_shape(ir_node_t* node)
 
 static int init_op(ir_op_t* op)
 {
-    struct fc_param* fc_param = ( struct fc_param* )sys_malloc(sizeof(struct fc_param));
+    struct fc_param* fc_param = (struct fc_param*)sys_malloc(sizeof(struct fc_param));
 
     if (fc_param == NULL)
     {
diff --git a/source/operator/prototype/flatten.c b/source/operator/prototype/flatten.c
index 9bfbc2777..5354dfce2 100644
--- a/source/operator/prototype/flatten.c
+++ b/source/operator/prototype/flatten.c
@@ -31,14 +31,13 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct flatten_param* flatten_param = ( struct flatten_param* )(node->op.param_mem);
+    struct flatten_param* flatten_param = (struct flatten_param*)(node->op.param_mem);
 
     int new_channel = 1;
     for (int i = flatten_param->axis; i <= flatten_param->end_axis && i < input->dim_num; i++)
@@ -59,10 +58,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct flatten_param* flatten_param = ( struct flatten_param* )sys_malloc(sizeof(struct flatten_param));
+    struct flatten_param* flatten_param = (struct flatten_param*)sys_malloc(sizeof(struct flatten_param));
 
     if (flatten_param == NULL)
     {
@@ -81,13 +79,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_flatten_op()
 {
     struct method m;
@@ -96,11 +92,9 @@ int register_flatten_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_FLATTEN, OP_FLATTEN_NAME, &m);
 }
 
-
 int unregister_flatten_op()
 {
     return unregister_op(OP_FLATTEN, 1);
diff --git a/source/operator/prototype/gather.c b/source/operator/prototype/gather.c
index 4dd72a5b5..0027a57c4 100644
--- a/source/operator/prototype/gather.c
+++ b/source/operator/prototype/gather.c
@@ -32,43 +32,42 @@
 #include "utility/sys_port.h"
 #include "utility/vector.h"
 
-
 static int infer_shape(struct node* node)
 {
-    struct graph* graph   = node->graph;
-    struct tensor* input  = get_ir_graph_tensor(graph, node->input_tensors[0]);
+    struct graph* graph = node->graph;
+    struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct gather_param* _param = ( struct gather_param* )(node->op.param_mem);
-  
+    struct gather_param* _param = (struct gather_param*)(node->op.param_mem);
+
     int indices_size = _param->indices_num;
-    
+
     struct vector* new_shape_temp = create_vector(sizeof(int), NULL);
-    if(_param->is_onnx)
+    if (_param->is_onnx)
     {
-        if(_param->axis == 0)
+        if (_param->axis == 0)
         {
-            for(int i = 0; i < input->dim_num  - 1; i++)
+            for (int i = 0; i < input->dim_num - 1; i++)
             {
-                push_vector_data(new_shape_temp, (void* )&input->dims[i+1]);
+                push_vector_data(new_shape_temp, (void*)&input->dims[i + 1]);
             }
         }
         else
         {
-            for(int i = 0; i < input->dim_num; i++)
+            for (int i = 0; i < input->dim_num; i++)
             {
-                if(i == _param->axis)
-                    push_vector_data(new_shape_temp, (void* )&indices_size);
+                if (i == _param->axis)
+                    push_vector_data(new_shape_temp, (void*)&indices_size);
                 else
-                    push_vector_data(new_shape_temp, (void* )&input->dims[i]);
+                    push_vector_data(new_shape_temp, (void*)&input->dims[i]);
             }
         }
 
-        int* shape_temp = (int *)sys_malloc(get_vector_num(new_shape_temp) * sizeof(int));
+        int* shape_temp = (int*)sys_malloc(get_vector_num(new_shape_temp) * sizeof(int));
 
-        for (int i=0; i<get_vector_num(new_shape_temp); i++)
+        for (int i = 0; i < get_vector_num(new_shape_temp); i++)
         {
-            int* a = (int* )get_vector_data(new_shape_temp, i);
+            int* a = (int*)get_vector_data(new_shape_temp, i);
             shape_temp[i] = *a;
         }
         set_ir_tensor_shape(output, shape_temp, get_vector_num(new_shape_temp));
@@ -76,29 +75,28 @@ static int infer_shape(struct node* node)
     }
     else
     {
-        int dims[4] ;
+        int dims[4];
         dims[0] = input->dims[0];
         dims[1] = input->dims[1];
         dims[2] = input->dims[2];
         dims[3] = input->dims[3];
 
-        if( _param->axis > ( int )input->dim_num) 
+        if (_param->axis > (int)input->dim_num)
         {
             return -1;
-        } 
-        dims[_param->axis] = indices_size; 
+        }
+        dims[_param->axis] = indices_size;
         set_ir_tensor_shape(output, dims, 4);
     }
-    
+
     release_vector(new_shape_temp);
 
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct gather_param* gather_param = ( struct gather_param* )sys_malloc(sizeof(struct gather_param));
+    struct gather_param* gather_param = (struct gather_param*)sys_malloc(sizeof(struct gather_param));
 
     if (gather_param == NULL)
     {
@@ -117,13 +115,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_gather_op()
 {
     struct method m;
@@ -132,11 +128,9 @@ int register_gather_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_GATHER, OP_GATHER_NAME, &m);
 }
 
-
 int unregister_gather_op()
 {
     return unregister_op(OP_GATHER, 1);
diff --git a/source/operator/prototype/gemm.c b/source/operator/prototype/gemm.c
index f1b299f54..5bb0151e8 100644
--- a/source/operator/prototype/gemm.c
+++ b/source/operator/prototype/gemm.c
@@ -32,7 +32,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -40,7 +39,7 @@ static int infer_shape(struct node* node)
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
     struct tensor* weight = get_ir_graph_tensor(graph, node->input_tensors[1]);
 
-    struct gemm_param* gemm_param = ( struct gemm_param* )(node->op.param_mem);
+    struct gemm_param* gemm_param = (struct gemm_param*)(node->op.param_mem);
 
     int dims[2];
     if (gemm_param->transA)
@@ -58,16 +57,15 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct gemm_param* gemm_param = ( struct gemm_param* )sys_malloc(sizeof(struct gemm_param));
+    struct gemm_param* gemm_param = (struct gemm_param*)sys_malloc(sizeof(struct gemm_param));
 
     if (gemm_param == NULL)
     {
         return -1;
     }
-    
+
     /*set the param default value */
     gemm_param->transA = 0;
     gemm_param->transB = 0;
@@ -80,13 +78,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_gemm_op()
 {
     struct method m;
@@ -98,7 +94,6 @@ int register_gemm_op()
     return register_op(OP_GEMM, OP_GEMM_NAME, &m);
 }
 
-
 int unregister_gemm_op()
 {
     return unregister_op(OP_GEMM, 1);
diff --git a/source/operator/prototype/generic.c b/source/operator/prototype/generic.c
index 2e6b70980..bf504d6d2 100644
--- a/source/operator/prototype/generic.c
+++ b/source/operator/prototype/generic.c
@@ -32,7 +32,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -62,10 +61,9 @@ static int infer_shape(struct node* node)
     return -1;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct generic_param* generic_param = ( struct generic_param* )sys_malloc(sizeof(struct generic_param));
+    struct generic_param* generic_param = (struct generic_param*)sys_malloc(sizeof(struct generic_param));
 
     if (generic_param == NULL)
     {
@@ -80,13 +78,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_generic_op()
 {
     struct method m;
@@ -98,7 +94,6 @@ int register_generic_op()
     return register_op(OP_GENERIC, OP_GENERIC_NAME, &m);
 }
 
-
 int unregister_generic_op()
 {
     return unregister_op(OP_GENERIC, 1);
diff --git a/source/operator/prototype/generic_param.h b/source/operator/prototype/generic_param.h
index 7d04091b8..c63206af8 100644
--- a/source/operator/prototype/generic_param.h
+++ b/source/operator/prototype/generic_param.h
@@ -27,7 +27,7 @@
 
 struct generic_param
 {
-    const char* op_name;    // what real action?
+    const char* op_name; // what real action?
     int max_input_num;
     int max_output_num;
 };
diff --git a/source/operator/prototype/gru.c b/source/operator/prototype/gru.c
index 0b8f5a89d..bfca1de3e 100644
--- a/source/operator/prototype/gru.c
+++ b/source/operator/prototype/gru.c
@@ -32,14 +32,13 @@
 #include "utility/sys_port.h"
 #include "utility/vector.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* weight = get_ir_graph_tensor(ir_graph, node->input_tensors[1]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct gru_param* gru_param = ( struct gru_param* )(node->op.param_mem);
+    struct gru_param* gru_param = (struct gru_param*)(node->op.param_mem);
     int batch_size = input->dims[1];
     int dims[4];
     dims[0] = input->dims[0];
@@ -51,10 +50,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    gru_param_t* gru_param = ( gru_param_t* )sys_malloc(sizeof(gru_param_t));
+    gru_param_t* gru_param = (gru_param_t*)sys_malloc(sizeof(gru_param_t));
 
     if (gru_param == NULL)
     {
@@ -79,13 +77,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_gru_op()
 {
     struct method m;
@@ -94,11 +90,9 @@ int register_gru_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_GRU, OP_GRU_NAME, &m);
 }
 
-
 int unregister_gru_op()
 {
     return unregister_op(OP_GRU, 1);
diff --git a/source/operator/prototype/gru_param.h b/source/operator/prototype/gru_param.h
index d1ba5266b..ae85273a1 100644
--- a/source/operator/prototype/gru_param.h
+++ b/source/operator/prototype/gru_param.h
@@ -26,7 +26,7 @@
 #define __GRU_PARAM_H__
 
 #define GRU_ACT_SIGMOID 1
-#define GRU_ACT_TANH 2
+#define GRU_ACT_TANH    2
 typedef struct gru_param
 {
     float clip;
diff --git a/source/operator/prototype/hardsigmoid.c b/source/operator/prototype/hardsigmoid.c
index 1f44c4655..77d779a82 100644
--- a/source/operator/prototype/hardsigmoid.c
+++ b/source/operator/prototype/hardsigmoid.c
@@ -32,7 +32,6 @@
 
 #include <string.h>
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -44,10 +43,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct hard_sigmoid_param* param = ( struct hard_sigmoid_param* )sys_malloc(sizeof(struct hard_sigmoid_param));
+    struct hard_sigmoid_param* param = (struct hard_sigmoid_param*)sys_malloc(sizeof(struct hard_sigmoid_param));
 
     if (param == NULL)
     {
@@ -64,13 +62,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_hardsigmoid_op()
 {
     struct method m;
@@ -82,7 +78,6 @@ int register_hardsigmoid_op()
     return register_op(OP_HARDSIGMOID, OP_HARDSIGMOID_NAME, &m);
 }
 
-
 int unregister_hardsigmoid_op()
 {
     return unregister_op(OP_HARDSIGMOID, 1);
diff --git a/source/operator/prototype/hardswish.c b/source/operator/prototype/hardswish.c
index 49819c71b..04216b9dd 100644
--- a/source/operator/prototype/hardswish.c
+++ b/source/operator/prototype/hardswish.c
@@ -32,10 +32,9 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int init_op(struct op* op)
 {
-    struct hardswish_param* hardswish_param = ( struct hardswish_param* )sys_malloc(sizeof(struct hardswish_param));
+    struct hardswish_param* hardswish_param = (struct hardswish_param*)sys_malloc(sizeof(struct hardswish_param));
 
     if (hardswish_param == NULL)
     {
@@ -54,13 +53,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_hardswish_op()
 {
     struct method m;
@@ -72,7 +69,6 @@ int register_hardswish_op()
     return register_op(OP_HARDSWISH, OP_HARDSWISH_NAME, &m);
 }
 
-
 int unregister_hardswish_op()
 {
     return unregister_op(OP_HARDSWISH, 1);
diff --git a/source/operator/prototype/input.c b/source/operator/prototype/input.c
index a9166e60a..551d18ee3 100644
--- a/source/operator/prototype/input.c
+++ b/source/operator/prototype/input.c
@@ -28,7 +28,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int init_op(ir_op_t* op)
 {
     op->same_shape = 1;
@@ -37,9 +36,9 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
-static void release_op(ir_op_t* op) {}
-
+static void release_op(ir_op_t* op)
+{
+}
 
 int register_input_op()
 {
@@ -49,10 +48,9 @@ int register_input_op()
     m.init = init_op;
     m.release = release_op;
 
-    return register_op(OP_INPUT, OP_INPUT_NAME , &m);
+    return register_op(OP_INPUT, OP_INPUT_NAME, &m);
 }
 
-
 int unregister_input_op()
 {
     return unregister_op(OP_INPUT, 1);
diff --git a/source/operator/prototype/instancenorm.c b/source/operator/prototype/instancenorm.c
index 279e45adc..001f474ae 100644
--- a/source/operator/prototype/instancenorm.c
+++ b/source/operator/prototype/instancenorm.c
@@ -34,7 +34,6 @@
 
 #include <string.h>
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -46,10 +45,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct instancenorm_Param* param = ( struct instancenorm_Param* )sys_malloc(sizeof(struct instancenorm_Param));
+    struct instancenorm_Param* param = (struct instancenorm_Param*)sys_malloc(sizeof(struct instancenorm_Param));
 
     if (param == NULL)
     {
@@ -66,13 +64,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_instancenorm_op()
 {
     struct method m;
@@ -84,7 +80,6 @@ int register_instancenorm_op()
     return register_op(OP_INSTANCENORM, OP_INSTANCENORM_NAME, &m);
 }
 
-
 int unregister_instancenorm_op()
 {
     return unregister_op(OP_INSTANCENORM, 1);
diff --git a/source/operator/prototype/interp.c b/source/operator/prototype/interp.c
index 63815c5dd..88c36499b 100644
--- a/source/operator/prototype/interp.c
+++ b/source/operator/prototype/interp.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -42,7 +41,7 @@ static int infer_shape(struct node* node)
     int in_h = input->dims[2];
     int in_w = input->dims[3];
 
-    struct interp_param* param = ( struct interp_param* )(node->op.param_mem);
+    struct interp_param* param = (struct interp_param*)(node->op.param_mem);
 
     if (param == NULL)
     {
@@ -56,8 +55,8 @@ static int infer_shape(struct node* node)
     }
     else
     {
-        param->height_scale = (float )param->output_height / (float )in_h;
-        param->width_scale = (float )param->output_width / (float )in_w;
+        param->height_scale = (float)param->output_height / (float)in_h;
+        param->width_scale = (float)param->output_width / (float)in_w;
     }
 
     int dim[4] = {0};
@@ -72,10 +71,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct interp_param* interp_param = ( struct interp_param* )sys_malloc(sizeof(struct interp_param));
+    struct interp_param* interp_param = (struct interp_param*)sys_malloc(sizeof(struct interp_param));
 
     if (interp_param == NULL)
     {
@@ -97,13 +95,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_interp_op()
 {
     struct method m;
@@ -115,7 +111,6 @@ int register_interp_op()
     return register_op(OP_INTERP, OP_INTERP_NAME, &m);
 }
 
-
 int unregister_interp_op()
 {
     return unregister_op(OP_INTERP, 1);
diff --git a/source/operator/prototype/l2normalization.c b/source/operator/prototype/l2normalization.c
index 8089cc4bd..b6dfed267 100644
--- a/source/operator/prototype/l2normalization.c
+++ b/source/operator/prototype/l2normalization.c
@@ -30,7 +30,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -42,7 +41,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -51,9 +49,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_l2normalization_op()
 {
@@ -66,7 +64,6 @@ int register_l2normalization_op()
     return register_op(OP_L2NORMALIZATION, OP_L2NORMALIZATION_NAME, &m);
 }
 
-
 int unregister_l2normalization_op()
 {
     return unregister_op(OP_L2NORMALIZATION, 1);
diff --git a/source/operator/prototype/l2pool.c b/source/operator/prototype/l2pool.c
index b4e24f355..d5c79cbe6 100644
--- a/source/operator/prototype/l2pool.c
+++ b/source/operator/prototype/l2pool.c
@@ -31,26 +31,28 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct l2pool_param* l2pool_param = (struct l2pool_param* )(node->op.param_mem);
+    struct l2pool_param* l2pool_param = (struct l2pool_param*)(node->op.param_mem);
 
     int input_h = input_tensor->dims[1];
     int input_w = input_tensor->dims[2];
     int output_h = 0;
     int output_w = 0;
 
-    if(l2pool_param->paddingType == 1){
-        output_h = (input_h + l2pool_param->stride_h -1 )/l2pool_param->stride_h;
-        output_w = (input_w + l2pool_param->stride_w -1 )/l2pool_param->stride_w;
-    } else {
-        output_h = (input_h + l2pool_param->stride_h - l2pool_param->kernel_h)/l2pool_param->stride_h;
-        output_w = (input_w + l2pool_param->stride_w - l2pool_param->kernel_w)/l2pool_param->stride_w;
+    if (l2pool_param->paddingType == 1)
+    {
+        output_h = (input_h + l2pool_param->stride_h - 1) / l2pool_param->stride_h;
+        output_w = (input_w + l2pool_param->stride_w - 1) / l2pool_param->stride_w;
+    }
+    else
+    {
+        output_h = (input_h + l2pool_param->stride_h - l2pool_param->kernel_h) / l2pool_param->stride_h;
+        output_w = (input_w + l2pool_param->stride_w - l2pool_param->kernel_w) / l2pool_param->stride_w;
     }
     int dims[4];
     dims[0] = input_tensor->dims[0];
@@ -61,13 +63,11 @@ static int infer_shape(struct node* node)
     set_ir_tensor_shape(output_tensor, dims, 4);
 
     return 0;
-
 }
 
-
 static int init_op(struct op* op)
 {
-    struct l2pool_param* l2pool_param = ( struct l2pool_param* )sys_malloc(sizeof(struct l2pool_param));
+    struct l2pool_param* l2pool_param = (struct l2pool_param*)sys_malloc(sizeof(struct l2pool_param));
 
     if (l2pool_param == NULL)
     {
@@ -82,13 +82,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_l2pool_op()
 {
     struct method m;
@@ -96,13 +94,10 @@ int register_l2pool_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_L2POOL, OP_L2POOL_NAME, &m);
-
 }
 
-
 int unregister_l2pool_op()
 {
-    return unregister_op(OP_L2POOL,1);
+    return unregister_op(OP_L2POOL, 1);
 }
diff --git a/source/operator/prototype/l2pool_param.h b/source/operator/prototype/l2pool_param.h
index 477242241..57eef5352 100644
--- a/source/operator/prototype/l2pool_param.h
+++ b/source/operator/prototype/l2pool_param.h
@@ -25,7 +25,8 @@
 #ifndef __L2POOL_H__
 #define __L2POOL_H__
 
-enum{
+enum
+{
     kNone = 0,
     kSame,
     kValid
diff --git a/source/operator/prototype/logical.c b/source/operator/prototype/logical.c
index 4091ff40f..ddb66ead4 100644
--- a/source/operator/prototype/logical.c
+++ b/source/operator/prototype/logical.c
@@ -32,7 +32,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     if (node->input_num == 1)
@@ -64,10 +63,9 @@ static int infer_shape(struct node* node)
     return -1;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct logical_param* logical_param = ( struct logical_param* )sys_malloc(sizeof(struct logical_param));
+    struct logical_param* logical_param = (struct logical_param*)sys_malloc(sizeof(struct logical_param));
 
     if (logical_param == NULL)
     {
@@ -85,13 +83,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_logical_op()
 {
     struct method m;
@@ -103,7 +99,6 @@ int register_logical_op()
     return register_op(OP_LOGICAL, OP_LOGICAL_NAME, &m);
 }
 
-
 int unregister_logical_op()
 {
     return unregister_op(OP_LOGICAL, 1);
diff --git a/source/operator/prototype/logsoftmax.c b/source/operator/prototype/logsoftmax.c
index 73b44db13..3fc1946ba 100644
--- a/source/operator/prototype/logsoftmax.c
+++ b/source/operator/prototype/logsoftmax.c
@@ -32,7 +32,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -44,10 +43,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct logsoftmax_param* logsoftmax_param = ( struct logsoftmax_param* )sys_malloc(sizeof(struct logsoftmax_param));
+    struct logsoftmax_param* logsoftmax_param = (struct logsoftmax_param*)sys_malloc(sizeof(struct logsoftmax_param));
 
     if (logsoftmax_param == NULL)
     {
@@ -64,13 +62,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_logsoftmax_op()
 {
     struct method m;
@@ -81,8 +77,7 @@ int register_logsoftmax_op()
     return register_op(OP_LOGSOFTMAX, OP_LOGSOFTMAX_NAME, &m);
 }
 
-
 int unregister_logsoftmax_op()
 {
-    return unregister_op(OP_LOGSOFTMAX,1);
+    return unregister_op(OP_LOGSOFTMAX, 1);
 }
diff --git a/source/operator/prototype/lrn.c b/source/operator/prototype/lrn.c
index a45e9655e..b24d6efed 100644
--- a/source/operator/prototype/lrn.c
+++ b/source/operator/prototype/lrn.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -43,10 +42,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct lrn_param* lrn_param = ( struct lrn_param* )sys_malloc(sizeof(struct lrn_param));
+    struct lrn_param* lrn_param = (struct lrn_param*)sys_malloc(sizeof(struct lrn_param));
 
     if (lrn_param == NULL)
     {
@@ -68,13 +66,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_lrn_op()
 {
     struct method m;
@@ -83,11 +79,9 @@ int register_lrn_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_LRN, OP_LRN_NAME, &m);
 }
 
-
 int unregister_lrn_op()
 {
     return unregister_op(OP_LRN, 1);
diff --git a/source/operator/prototype/lstm.c b/source/operator/prototype/lstm.c
index af55daccd..280a86013 100644
--- a/source/operator/prototype/lstm.c
+++ b/source/operator/prototype/lstm.c
@@ -32,13 +32,12 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct lstm_param* lstm_param = ( struct lstm_param* )(node->op.param_mem);
+    struct lstm_param* lstm_param = (struct lstm_param*)(node->op.param_mem);
     int batch_size = input->dims[1];
     if (lstm_param->mxnet_flag == 0)
     {
@@ -64,10 +63,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    lstm_param_t* lstm_param = ( lstm_param_t* )sys_malloc(sizeof(lstm_param_t));
+    lstm_param_t* lstm_param = (lstm_param_t*)sys_malloc(sizeof(lstm_param_t));
 
     if (lstm_param == NULL)
     {
@@ -95,13 +93,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_lstm_op()
 {
     struct method m;
@@ -113,7 +109,6 @@ int register_lstm_op()
     return register_op(OP_LSTM, OP_LSTM_NAME, &m);
 }
 
-
 int unregister_lstm_op()
 {
     return unregister_op(OP_LSTM, 1);
diff --git a/source/operator/prototype/lstm_param.h b/source/operator/prototype/lstm_param.h
index 0aa111974..9fc8ff60c 100644
--- a/source/operator/prototype/lstm_param.h
+++ b/source/operator/prototype/lstm_param.h
@@ -26,7 +26,7 @@
 #define __LSTM_PARAM_H__
 
 #define LSTM_ACT_SIGMOID 1
-#define LSTM_ACT_TANH 2
+#define LSTM_ACT_TANH    2
 typedef struct lstm_param
 {
     float forget_bias;
diff --git a/source/operator/prototype/matmul.c b/source/operator/prototype/matmul.c
index 262a4e742..74a93c0d1 100644
--- a/source/operator/prototype/matmul.c
+++ b/source/operator/prototype/matmul.c
@@ -29,7 +29,6 @@
 #include "module/module.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -77,7 +76,6 @@ static int infer_shape(struct node* node)
     return -1;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -85,7 +83,6 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 int register_matmul_op()
 {
     struct method m;
@@ -94,11 +91,9 @@ int register_matmul_op()
     m.init = init_op;
     m.release = NULL;
 
-
     return register_op(OP_MATMUL, OP_MATMUL_NAME, &m);
 }
 
-
 int unregister_matmul_op()
 {
     return unregister_op(OP_MATMUL, 1);
diff --git a/source/operator/prototype/maximum.c b/source/operator/prototype/maximum.c
index 2d72a812c..d0bf587c9 100644
--- a/source/operator/prototype/maximum.c
+++ b/source/operator/prototype/maximum.c
@@ -28,7 +28,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 1;
@@ -36,7 +35,6 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 int register_maximum_op()
 {
     struct method m;
@@ -47,7 +45,6 @@ int register_maximum_op()
     return register_op(OP_MAXIMUM, OP_MAXIMUM_NAME, &m);
 }
 
-
 int unregister_maximum_op()
 {
     return unregister_op(OP_MAXIMUM, 1);
diff --git a/source/operator/prototype/mean.c b/source/operator/prototype/mean.c
index 345f2473e..f4d9ec962 100644
--- a/source/operator/prototype/mean.c
+++ b/source/operator/prototype/mean.c
@@ -28,7 +28,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 1;
@@ -36,7 +35,6 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 int register_mean_op()
 {
     struct method m;
@@ -47,7 +45,6 @@ int register_mean_op()
     return register_op(OP_MEAN, OP_MEAN_NAME, &m);
 }
 
-
 int unregister_mean_op()
 {
     return unregister_op(OP_MEAN, 1);
diff --git a/source/operator/prototype/minimum.c b/source/operator/prototype/minimum.c
index fb83dcbbd..9dc624feb 100644
--- a/source/operator/prototype/minimum.c
+++ b/source/operator/prototype/minimum.c
@@ -30,7 +30,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 1;
@@ -38,7 +37,6 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 int register_minimum_op()
 {
     struct method m;
@@ -49,7 +47,6 @@ int register_minimum_op()
     return register_op(OP_MINIMUM, OP_MINIMUM_NAME, &m);
 }
 
-
 int unregister_minimum_op()
 {
     return unregister_op(OP_MINIMUM, 1);
diff --git a/source/operator/prototype/mish.c b/source/operator/prototype/mish.c
index a47e0e83a..bd45c69e8 100644
--- a/source/operator/prototype/mish.c
+++ b/source/operator/prototype/mish.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -48,9 +46,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_mish_op()
 {
@@ -63,7 +61,6 @@ int register_mish_op()
     return register_op(OP_MISH, OP_MISH_NAME, &m);
 }
 
-
 int unregister_mish_op()
 {
     return unregister_op(OP_MISH, 1);
diff --git a/source/operator/prototype/mvn.c b/source/operator/prototype/mvn.c
index 4be7fe36a..b200b2e7a 100644
--- a/source/operator/prototype/mvn.c
+++ b/source/operator/prototype/mvn.c
@@ -32,10 +32,9 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int init_op(struct op* op)
 {
-    struct mvn_param* param = ( struct mvn_param* )sys_malloc(sizeof(struct mvn_param));
+    struct mvn_param* param = (struct mvn_param*)sys_malloc(sizeof(struct mvn_param));
 
     if (param == NULL)
     {
@@ -51,13 +50,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_mvn_op()
 {
     struct method m;
@@ -69,7 +66,6 @@ int register_mvn_op()
     return register_op(OP_MVN, OP_MVN_NAME, &m);
 }
 
-
 int unregister_mvn_op()
 {
     return unregister_op(OP_MVN, 1);
diff --git a/source/operator/prototype/noop.c b/source/operator/prototype/noop.c
index 3df6c4e9a..1ad30a9bd 100644
--- a/source/operator/prototype/noop.c
+++ b/source/operator/prototype/noop.c
@@ -28,7 +28,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 1;
@@ -37,13 +36,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     // sys_free(op->param_mem);
 }
 
-
 int register_noop_op()
 {
     struct method m;
@@ -55,7 +52,6 @@ int register_noop_op()
     return register_op(OP_NOOP, OP_NOOP_NAME, &m);
 }
 
-
 int unregister_noop_op()
 {
     return unregister_op(OP_NOOP, 1);
diff --git a/source/operator/prototype/normalize.c b/source/operator/prototype/normalize.c
index efbfbabe4..ec3068ce2 100644
--- a/source/operator/prototype/normalize.c
+++ b/source/operator/prototype/normalize.c
@@ -31,10 +31,9 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int init_op(struct op* op)
 {
-    normalize_param_t* normalize_param = ( normalize_param_t* )sys_malloc(sizeof(normalize_param_t));
+    normalize_param_t* normalize_param = (normalize_param_t*)sys_malloc(sizeof(normalize_param_t));
 
     if (normalize_param == NULL)
     {
@@ -52,13 +51,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_normalize_op()
 {
     struct method m;
@@ -70,7 +67,6 @@ int register_normalize_op()
     return register_op(OP_NORMALIZE, OP_NORMALIZE_NAME, &m);
 }
 
-
 int unregister_normalize_op()
 {
     return unregister_op(OP_NORMALIZE, 1);
diff --git a/source/operator/prototype/pad.c b/source/operator/prototype/pad.c
index a04c6a968..c96ad367e 100644
--- a/source/operator/prototype/pad.c
+++ b/source/operator/prototype/pad.c
@@ -31,18 +31,16 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* graph = node->graph;
     ir_tensor_t* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct pad_param* pad_param = ( struct pad_param* )(node->op.param_mem);
+    struct pad_param* pad_param = (struct pad_param*)(node->op.param_mem);
 
     int dims[TE_MAX_SHAPE_DIM_NUM] = {0};
-    if (pad_param->pad_0_h != -1 && pad_param->pad_0_w != -1 && pad_param->pad_1_h != -1 && pad_param->pad_1_w != -1 &&
-        pad_param->pad_2_h != -1 && pad_param->pad_2_w != -1 && pad_param->pad_3_h != -1 && pad_param->pad_3_w != -1)
+    if (pad_param->pad_0_h != -1 && pad_param->pad_0_w != -1 && pad_param->pad_1_h != -1 && pad_param->pad_1_w != -1 && pad_param->pad_2_h != -1 && pad_param->pad_2_w != -1 && pad_param->pad_3_h != -1 && pad_param->pad_3_w != -1)
     {
         dims[0] = input->dims[0] + pad_param->pad_0_h + pad_param->pad_0_w;
         dims[1] = input->dims[1] + pad_param->pad_1_h + pad_param->pad_1_w;
@@ -59,10 +57,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct pad_param* pad_param = ( struct pad_param* )sys_malloc(sizeof(struct pad_param));
+    struct pad_param* pad_param = (struct pad_param*)sys_malloc(sizeof(struct pad_param));
 
     if (pad_param == NULL)
     {
@@ -70,13 +67,13 @@ static int init_op(ir_op_t* op)
     }
 
     pad_param->mode = 0;
-    pad_param->pad_0_h = -1;    // n
+    pad_param->pad_0_h = -1; // n
     pad_param->pad_0_w = -1;
-    pad_param->pad_1_h = -1;    // c
+    pad_param->pad_1_h = -1; // c
     pad_param->pad_1_w = -1;
-    pad_param->pad_2_h = -1;    // h
+    pad_param->pad_2_h = -1; // h
     pad_param->pad_2_w = -1;
-    pad_param->pad_3_h = -1;    // w
+    pad_param->pad_3_h = -1; // w
     pad_param->pad_3_w = -1;
     pad_param->value = 0;
 
@@ -89,13 +86,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_pad_op()
 {
     ir_method_t m;
@@ -107,7 +102,6 @@ int register_pad_op()
     return register_op(OP_PAD, OP_PAD_NAME, &m);
 }
 
-
 int unregister_pad_op()
 {
     return unregister_op(OP_PAD, 1);
diff --git a/source/operator/prototype/pad_param.h b/source/operator/prototype/pad_param.h
index 87228a054..166a90a9f 100644
--- a/source/operator/prototype/pad_param.h
+++ b/source/operator/prototype/pad_param.h
@@ -28,13 +28,13 @@ struct pad_param
 {
     // mode : 0: CONSTANT; 1: REFLECT; 2: SYMMETRIC.
     int mode;
-    int pad_0_h;    // n
+    int pad_0_h; // n
     int pad_0_w;
-    int pad_1_h;    // c
+    int pad_1_h; // c
     int pad_1_w;
-    int pad_2_h;    // h
+    int pad_2_h; // h
     int pad_2_w;
-    int pad_3_h;    // w
+    int pad_3_h; // w
     int pad_3_w;
-    float value;    // pad value
+    float value; // pad value
 };
diff --git a/source/operator/prototype/permute.c b/source/operator/prototype/permute.c
index 49568214b..234071341 100644
--- a/source/operator/prototype/permute.c
+++ b/source/operator/prototype/permute.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    permute_param_t* param = ( struct permute_param* )(node->op.param_mem);
+    permute_param_t* param = (struct permute_param*)(node->op.param_mem);
 
     int dims[TE_MAX_SHAPE_DIM_NUM] = {0};
     int dim_size = input->dim_num;
@@ -67,10 +66,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct permute_param* permute_param = ( struct permute_param* )sys_malloc(sizeof(struct permute_param));
+    struct permute_param* permute_param = (struct permute_param*)sys_malloc(sizeof(struct permute_param));
 
     if (permute_param == NULL)
     {
@@ -91,13 +89,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_permute_op()
 {
     struct method m;
@@ -109,7 +105,6 @@ int register_permute_op()
     return register_op(OP_PERMUTE, OP_PERMUTE_NAME, &m);
 }
 
-
 int unregister_permute_op()
 {
     return unregister_op(OP_PERMUTE, 1);
diff --git a/source/operator/prototype/pooling.c b/source/operator/prototype/pooling.c
index 0ca3d1b7a..7a4a80e2d 100644
--- a/source/operator/prototype/pooling.c
+++ b/source/operator/prototype/pooling.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
     ir_tensor_t* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     ir_tensor_t* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct pool_param* pool_param = ( struct pool_param* )node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)node->op.param_mem;
 
     int batch = input->dims[0];
     int channel = input->dims[1];
@@ -45,9 +44,7 @@ static int infer_shape(ir_node_t* node)
     int input_w = input->dims[3];
     int output_h, output_w;
 
-    if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w &&
-        pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 &&
-        pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0)
+    if (pool_param->kernel_h == input_h && pool_param->kernel_w == input_w && pool_param->pad_w0 == 0 && pool_param->pad_w1 == 0 && pool_param->pad_h0 == 0 && pool_param->pad_h1 == 0)
     {
         pool_param->global = 1;
     }
@@ -102,10 +99,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct pool_param* pool_param = ( struct pool_param* )sys_malloc(sizeof(struct pool_param));
+    struct pool_param* pool_param = (struct pool_param*)sys_malloc(sizeof(struct pool_param));
 
     if (pool_param == NULL)
     {
@@ -137,13 +133,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_pooling_op()
 {
     ir_method_t m;
@@ -155,7 +149,6 @@ int register_pooling_op()
     return register_op(OP_POOL, OP_POOL_NAME, &m);
 }
 
-
 int unregister_pooling_op()
 {
     return unregister_op(OP_POOL, 1);
diff --git a/source/operator/prototype/pooling_param.h b/source/operator/prototype/pooling_param.h
index 6bd28d9ea..214c3df33 100644
--- a/source/operator/prototype/pooling_param.h
+++ b/source/operator/prototype/pooling_param.h
@@ -35,7 +35,7 @@ enum
 
 struct pool_param
 {
-    int pool_method;    // 0:max    1:avg
+    int pool_method; // 0:max    1:avg
     int kernel_h;
     int kernel_w;
     int stride_h;
@@ -44,7 +44,7 @@ struct pool_param
     int pad_h1;
     int pad_w0;
     int pad_w1;
-    int global;    // 0:general    1:global
+    int global; // 0:general    1:global
     int caffe_flavor;
     void* funct;
 
diff --git a/source/operator/prototype/prelu.c b/source/operator/prototype/prelu.c
index 1496333d3..8fd689a45 100644
--- a/source/operator/prototype/prelu.c
+++ b/source/operator/prototype/prelu.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -48,9 +46,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_prelu_op()
 {
@@ -60,11 +58,9 @@ int register_prelu_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_PRELU, OP_PRELU_NAME, &m);
 }
 
-
 int unregister_prelu_op()
 {
     return unregister_op(OP_PRELU, 1);
diff --git a/source/operator/prototype/priorbox.c b/source/operator/prototype/priorbox.c
index 1ef13f0d8..79251ef35 100644
--- a/source/operator/prototype/priorbox.c
+++ b/source/operator/prototype/priorbox.c
@@ -31,10 +31,9 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
-    priorbox_param_t* priorbox_param = ( priorbox_param_t* )node->op.param_mem;
+    priorbox_param_t* priorbox_param = (priorbox_param_t*)node->op.param_mem;
 
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
@@ -76,10 +75,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct priorbox_param* priorbox_param = ( struct priorbox_param* )sys_malloc(sizeof(struct priorbox_param));
+    struct priorbox_param* priorbox_param = (struct priorbox_param*)sys_malloc(sizeof(struct priorbox_param));
 
     if (priorbox_param == NULL)
     {
@@ -96,10 +94,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
-    struct priorbox_param* priorbox_param = ( struct priorbox_param* )op->param_mem;
+    struct priorbox_param* priorbox_param = (struct priorbox_param*)op->param_mem;
 
     if (priorbox_param->aspect_ratio)
         sys_free(priorbox_param->aspect_ratio);
@@ -113,7 +110,6 @@ static void release_op(struct op* op)
     sys_free(op->param_mem);
 }
 
-
 int register_priorbox_op()
 {
     struct method m;
@@ -122,11 +118,9 @@ int register_priorbox_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_PRIORBOX, OP_PRIORBOX_NAME, &m);
 }
 
-
 int unregister_priorbox_op()
 {
     return unregister_op(OP_PRIORBOX, 1);
diff --git a/source/operator/prototype/psroipooling.c b/source/operator/prototype/psroipooling.c
index 30d597d1e..3508571b8 100644
--- a/source/operator/prototype/psroipooling.c
+++ b/source/operator/prototype/psroipooling.c
@@ -32,14 +32,13 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct psroipooling_param* psroipooling_param = ( struct psroipooling_param* )(node->op.param_mem);
+    struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)(node->op.param_mem);
 
     int output_n = input->dims[0];
     int output_c = psroipooling_param->output_dim;
@@ -58,7 +57,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)sys_malloc(sizeof(struct psroipooling_param));
@@ -82,13 +80,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_psroipooling_op()
 {
     struct method m;
@@ -100,7 +96,6 @@ int register_psroipooling_op()
     return register_op(OP_PSROIPOOLING, OP_PSROIPOOLING_NAME, &m);
 }
 
-
 int unregister_psroipooling_op()
 {
     return unregister_op(OP_PSROIPOOLING, 1);
diff --git a/source/operator/prototype/reciprocal.c b/source/operator/prototype/reciprocal.c
index 74724f5ca..91a49689a 100644
--- a/source/operator/prototype/reciprocal.c
+++ b/source/operator/prototype/reciprocal.c
@@ -48,7 +48,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-static void release_op(struct op* op) {}
+static void release_op(struct op* op)
+{
+}
 
 int register_reciprocal_op()
 {
diff --git a/source/operator/prototype/reducel2.c b/source/operator/prototype/reducel2.c
index 107c02475..64871f9df 100644
--- a/source/operator/prototype/reducel2.c
+++ b/source/operator/prototype/reducel2.c
@@ -31,23 +31,22 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct reducel2_param* reducel2_param = ( struct reducel2_param* )node->op.param_mem;
+    struct reducel2_param* reducel2_param = (struct reducel2_param*)node->op.param_mem;
 
     int kd = reducel2_param->keepdim;
     int axis = reducel2_param->axis;
 
-    int* out_dim = ( int* )sys_malloc(input->dim_num * sizeof(int));
+    int* out_dim = (int*)sys_malloc(input->dim_num * sizeof(int));
 
     if (axis < 0)
         axis = axis + input->dim_num;
 
-    for (unsigned int i = 0; i < input->dim_num && i < ( unsigned int )axis; i++)
+    for (unsigned int i = 0; i < input->dim_num && i < (unsigned int)axis; i++)
     {
         out_dim[i] = input->dims[i];
     }
@@ -67,10 +66,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct reducel2_param* reducel2_param = ( struct reducel2_param* )sys_malloc(sizeof(struct reducel2_param));
+    struct reducel2_param* reducel2_param = (struct reducel2_param*)sys_malloc(sizeof(struct reducel2_param));
 
     if (reducel2_param == NULL)
     {
@@ -88,13 +86,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_reducel2_op()
 {
     struct method m;
@@ -106,7 +102,6 @@ int register_reducel2_op()
     return register_op(OP_REDUCEL2, OP_REDUCEL2_NAME, &m);
 }
 
-
 int unregister_reducel2_op()
 {
     return unregister_op(OP_REDUCEL2, 1);
diff --git a/source/operator/prototype/reduction.c b/source/operator/prototype/reduction.c
index cea60886b..128c0d668 100644
--- a/source/operator/prototype/reduction.c
+++ b/source/operator/prototype/reduction.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct reduction_param* reduction_param = (struct reduction_param*)node->op.param_mem;
@@ -41,7 +40,7 @@ static int infer_shape(struct node* node)
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
 
     int kd = reduction_param->keepdim;
-    int* in_dim = ( int* )sys_malloc(input->dim_num * sizeof(int));
+    int* in_dim = (int*)sys_malloc(input->dim_num * sizeof(int));
 
     for (int i = 0; i < input->dim_num; i++)
     {
@@ -64,7 +63,7 @@ static int infer_shape(struct node* node)
     {
         count++;
     }
-    int* new_shape = ( int* )sys_malloc(count * sizeof(int));
+    int* new_shape = (int*)sys_malloc(count * sizeof(int));
     int size = 0;
     if (reduction_param->dim_0 != -2)
     {
@@ -87,7 +86,7 @@ static int infer_shape(struct node* node)
         size++;
     }
 
-    int8_t should_reduced[5] = { 0, 0, 0, 0, 0};
+    int8_t should_reduced[5] = {0, 0, 0, 0, 0};
 
     int reduceddim = 0;
     int real_shape[5] = {0, 1, 2, 3, 4};
@@ -138,7 +137,7 @@ static int infer_shape(struct node* node)
         }
         else
         {
-            int* odim = ( int* )sys_malloc(input->dim_num * sizeof(int));
+            int* odim = (int*)sys_malloc(input->dim_num * sizeof(int));
             for (int i_idx = 0, o_idx = 0; i_idx < input->dim_num; i_idx++)
             {
                 odim[o_idx++] = 1;
@@ -163,7 +162,7 @@ static int infer_shape(struct node* node)
         {
             o_size = input->dim_num;
         }
-        int* odim = ( int* )sys_malloc(o_size * sizeof(int));
+        int* odim = (int*)sys_malloc(o_size * sizeof(int));
         for (int i_idx = 0, o_idx = 0; i_idx < input->dim_num; i_idx++)
         {
             if (!should_reduced[i_idx])
@@ -184,10 +183,9 @@ static int infer_shape(struct node* node)
     }
 }
 
-
 static int init_op(struct op* op)
 {
-    struct reduction_param* reduction_param = ( struct reduction_param* )sys_malloc(sizeof(struct reduction_param));
+    struct reduction_param* reduction_param = (struct reduction_param*)sys_malloc(sizeof(struct reduction_param));
 
     if (reduction_param == NULL)
     {
@@ -209,13 +207,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_reduction_op()
 {
     struct method m;
@@ -224,11 +220,9 @@ int register_reduction_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_REDUCTION, OP_REDUCTION_NAME, &m);
 }
 
-
 int unregister_reduction_op()
 {
     return unregister_op(OP_REDUCTION, 1);
diff --git a/source/operator/prototype/region.c b/source/operator/prototype/region.c
index 8d2c23704..28490cc2f 100644
--- a/source/operator/prototype/region.c
+++ b/source/operator/prototype/region.c
@@ -31,10 +31,9 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int init_op(struct op* op)
 {
-    struct region_param* region_param = ( struct region_param* )sys_malloc(sizeof(struct region_param));
+    struct region_param* region_param = (struct region_param*)sys_malloc(sizeof(struct region_param));
 
     if (region_param == NULL)
     {
@@ -52,13 +51,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_region_op()
 {
     struct method m;
@@ -70,7 +67,6 @@ int register_region_op()
     return register_op(OP_REGION, OP_REGION_NAME, &m);
 }
 
-
 int unregister_region_op()
 {
     return unregister_op(OP_REGION, 1);
diff --git a/source/operator/prototype/relu.c b/source/operator/prototype/relu.c
index 859c5e461..57b179de2 100644
--- a/source/operator/prototype/relu.c
+++ b/source/operator/prototype/relu.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct relu_param* relu_param = ( struct relu_param* )sys_malloc(sizeof(struct relu_param));
+    struct relu_param* relu_param = (struct relu_param*)sys_malloc(sizeof(struct relu_param));
 
     if (relu_param == NULL)
     {
@@ -64,13 +62,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_relu_op()
 {
     ir_method_t m;
@@ -82,7 +78,6 @@ int register_relu_op()
     return register_op(OP_RELU, OP_RELU_NAME, &m);
 }
 
-
 int unregister_relu_op()
 {
     return unregister_op(OP_RELU, 1);
diff --git a/source/operator/prototype/relu1.c b/source/operator/prototype/relu1.c
index 96c151888..8ef7f40e1 100644
--- a/source/operator/prototype/relu1.c
+++ b/source/operator/prototype/relu1.c
@@ -22,7 +22,6 @@
  * Author: bzhang@openailab.com
  */
 
-
 #include "api/c_api.h"
 #include "graph/tensor.h"
 #include "graph/node.h"
@@ -30,7 +29,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -42,7 +40,6 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
     op->same_shape = 0;
@@ -51,9 +48,9 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
-static void release_op(ir_op_t* op) {}
-
+static void release_op(ir_op_t* op)
+{
+}
 
 int register_relu1_op()
 {
@@ -66,7 +63,6 @@ int register_relu1_op()
     return register_op(OP_RELU1, OP_RELU1_NAME, &m);
 }
 
-
 int unregister_relu1_op()
 {
     return unregister_op(OP_RELU1, 1);
diff --git a/source/operator/prototype/relu6.c b/source/operator/prototype/relu6.c
index b78a28d39..e2ff0d269 100644
--- a/source/operator/prototype/relu6.c
+++ b/source/operator/prototype/relu6.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -43,7 +42,6 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
     op->same_shape = 0;
@@ -52,9 +50,9 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
-static void release_op(ir_op_t* op) {}
-
+static void release_op(ir_op_t* op)
+{
+}
 
 int register_relu6_op()
 {
@@ -67,7 +65,6 @@ int register_relu6_op()
     return register_op(OP_RELU6, OP_RELU6_NAME, &m);
 }
 
-
 int unregister_relu6_op()
 {
     return unregister_op(OP_RELU6, 1);
diff --git a/source/operator/prototype/reorg.c b/source/operator/prototype/reorg.c
index d61dc30c3..b526ab224 100644
--- a/source/operator/prototype/reorg.c
+++ b/source/operator/prototype/reorg.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    struct reorg_param* reorg_param = ( struct reorg_param* )(node->op.param_mem);
+    struct reorg_param* reorg_param = (struct reorg_param*)(node->op.param_mem);
 
     int stride = reorg_param->stride;
 
@@ -58,10 +57,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct reorg_param* reorg_param = ( struct reorg_param* )sys_malloc(sizeof(struct reorg_param));
+    struct reorg_param* reorg_param = (struct reorg_param*)sys_malloc(sizeof(struct reorg_param));
 
     if (reorg_param == NULL)
     {
@@ -79,13 +77,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_reorg_op()
 {
     struct method m;
@@ -97,7 +93,6 @@ int register_reorg_op()
     return register_op(OP_REORG, OP_REORG_NAME, &m);
 }
 
-
 int unregister_reorg_op()
 {
     return unregister_op(OP_REORG, 1);
diff --git a/source/operator/prototype/reshape.c b/source/operator/prototype/reshape.c
index 7b2f31303..9c9252153 100644
--- a/source/operator/prototype/reshape.c
+++ b/source/operator/prototype/reshape.c
@@ -34,10 +34,9 @@
 
 #include <string.h>
 
-
 static int infer_shape(struct node* node)
 {
-    reshape_param_t* param = ( struct reshape_param* )(node->op.param_mem);
+    reshape_param_t* param = (struct reshape_param*)(node->op.param_mem);
 
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
@@ -59,13 +58,13 @@ static int infer_shape(struct node* node)
             if (param->is_mxnet)
             {
                 int temp = input->dims[in_idx];
-                push_vector_data(new_shape, ( void* )&temp);
+                push_vector_data(new_shape, (void*)&temp);
             }
             else
             {
                 int temp = 1;
                 if (i == 0)
-                    push_vector_data(new_shape, ( void* )&temp);
+                    push_vector_data(new_shape, (void*)&temp);
             }
 
             in_idx++;
@@ -73,20 +72,20 @@ static int infer_shape(struct node* node)
         else if (-1 == param->re_shape[i])
         {
             int temp = -1;
-            push_vector_data(new_shape, ( void* )&temp);
+            push_vector_data(new_shape, (void*)&temp);
             in_idx++;
         }
         else if (-2 == param->re_shape[i])
         {
             for (; in_idx < input_dim_size; ++in_idx)
             {
-                push_vector_data(new_shape, ( void* )&input->dims[in_idx]);
+                push_vector_data(new_shape, (void*)&input->dims[in_idx]);
             }
         }
         else if (-3 == param->re_shape[i])
         {
             int temp = input->dims[in_idx] * input->dims[in_idx + 1];
-            push_vector_data(new_shape, ( void* )&temp);
+            push_vector_data(new_shape, (void*)&temp);
             in_idx = in_idx + 2;
         }
         else if (-4 == param->re_shape[i])
@@ -94,14 +93,14 @@ static int infer_shape(struct node* node)
             int muti_val = param->re_shape[i + 1];
             if (muti_val == -1)
                 muti_val = 1;
-            push_vector_data(new_shape, ( void* )&muti_val);
-            push_vector_data(new_shape, ( void* )&param->re_shape[i + 2]);
+            push_vector_data(new_shape, (void*)&muti_val);
+            push_vector_data(new_shape, (void*)&param->re_shape[i + 2]);
             i = i + 2;
             in_idx++;
         }
         else
         {
-            push_vector_data(new_shape, ( void* )&param->re_shape[i]);
+            push_vector_data(new_shape, (void*)&param->re_shape[i]);
             in_idx++;
         }
     }
@@ -110,7 +109,7 @@ static int infer_shape(struct node* node)
     int dim_size = get_vector_num(new_shape);
     for (int i = 0; i < dim_size; i++)
     {
-        int temp = (( int* )get_vector_data(new_shape, i))[0];
+        int temp = ((int*)get_vector_data(new_shape, i))[0];
         if (temp == -1)
             idx = i;
         else
@@ -120,12 +119,12 @@ static int infer_shape(struct node* node)
     if (idx >= 0)
     {
         int temp = size / new_size;
-        set_vector_data(new_shape, idx, ( void* )&temp);
+        set_vector_data(new_shape, idx, (void*)&temp);
     }
 
-    if ((( int* )get_vector_data(new_shape, 0))[0] == -1 && get_vector_num(new_shape) == 1)
+    if (((int*)get_vector_data(new_shape, 0))[0] == -1 && get_vector_num(new_shape) == 1)
     {
-        set_vector_data(new_shape, 0, ( void* )&size);
+        set_vector_data(new_shape, 0, (void*)&size);
     }
 
     if (param->reverse)
@@ -145,16 +144,17 @@ static int infer_shape(struct node* node)
     }
 
     new_size = 1;
-    int* new_shape_temp = ( int* )sys_malloc(get_vector_num(new_shape) * sizeof(int));
+    int* new_shape_temp = (int*)sys_malloc(get_vector_num(new_shape) * sizeof(int));
 
     for (int i = 0; i < get_vector_num(new_shape); i++)
     {
-        int* a = ( int* )get_vector_data(new_shape, i);
+        int* a = (int*)get_vector_data(new_shape, i);
         new_shape_temp[i] = *a;
         new_size *= new_shape_temp[i];
     }
     // check input and reshaped size
-    if (new_size != size) {
+    if (new_size != size)
+    {
         TLOG_ERR("Error: input elem num(%d) != reshaped elem num(%d)\n", size, new_size);
         return -1;
     }
@@ -169,10 +169,9 @@ static int infer_shape(struct node* node)
     return ret;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct reshape_param* reshape_param = ( struct reshape_param* )sys_malloc(sizeof(struct reshape_param));
+    struct reshape_param* reshape_param = (struct reshape_param*)sys_malloc(sizeof(struct reshape_param));
 
     if (reshape_param == NULL)
     {
@@ -189,10 +188,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
-    struct reshape_param* reshape_param = ( struct reshape_param* )op->param_mem;
+    struct reshape_param* reshape_param = (struct reshape_param*)op->param_mem;
 
     if (reshape_param->re_shape)
         sys_free(reshape_param->re_shape);
@@ -200,7 +198,6 @@ static void release_op(struct op* op)
     sys_free(op->param_mem);
 }
 
-
 int register_reshape_op()
 {
     struct method m;
@@ -212,7 +209,6 @@ int register_reshape_op()
     return register_op(OP_RESHAPE, OP_RESHAPE_NAME, &m);
 }
 
-
 int unregister_reshape_op()
 {
     return unregister_op(OP_RESHAPE, 1);
diff --git a/source/operator/prototype/resize.c b/source/operator/prototype/resize.c
index df40dd8d2..676691c6b 100644
--- a/source/operator/prototype/resize.c
+++ b/source/operator/prototype/resize.c
@@ -32,26 +32,25 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    struct resize_param* resize_param = ( struct resize_param* )(node->op.param_mem);
+    struct resize_param* resize_param = (struct resize_param*)(node->op.param_mem);
 
     int dims[4];
     dims[0] = input->dims[0];
     if (graph->graph_layout == TENGINE_LAYOUT_NCHW)
     {
         dims[1] = input->dims[1];
-        dims[2] = ( int )(input->dims[2] * resize_param->scale_h);
-        dims[3] = ( int )(input->dims[3] * resize_param->scale_w);
+        dims[2] = (int)(input->dims[2] * resize_param->scale_h);
+        dims[3] = (int)(input->dims[3] * resize_param->scale_w);
     }
     else if (graph->graph_layout == TENGINE_LAYOUT_NHWC)
     {
-        dims[1] = ( int )(input->dims[1] * resize_param->scale_h);
-        dims[2] = ( int )(input->dims[2] * resize_param->scale_w);
+        dims[1] = (int)(input->dims[1] * resize_param->scale_h);
+        dims[2] = (int)(input->dims[2] * resize_param->scale_w);
         dims[3] = input->dims[3];
     }
     else
@@ -65,10 +64,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct resize_param* resize_param = ( struct resize_param* )sys_malloc(sizeof(struct resize_param));
+    struct resize_param* resize_param = (struct resize_param*)sys_malloc(sizeof(struct resize_param));
 
     if (resize_param == NULL)
     {
@@ -88,13 +86,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_resize_op()
 {
     struct method m;
@@ -106,7 +102,6 @@ int register_resize_op()
     return register_op(OP_RESIZE, OP_RESIZE_NAME, &m);
 }
 
-
 int unregister_resize_op()
 {
     return unregister_op(OP_RESIZE, 1);
diff --git a/source/operator/prototype/resize_param.h b/source/operator/prototype/resize_param.h
index 49c069bfb..d0748eef7 100644
--- a/source/operator/prototype/resize_param.h
+++ b/source/operator/prototype/resize_param.h
@@ -29,7 +29,7 @@ struct resize_param
 {
     float scale_w;
     float scale_h;
-    int type;    // 0 for NEAREST_NEIGHBOR   // 1 for BILIEAR
+    int type; // 0 for NEAREST_NEIGHBOR   // 1 for BILIEAR
 };
 
 #endif
diff --git a/source/operator/prototype/reverse.c b/source/operator/prototype/reverse.c
index 4098491d7..31fc1406f 100644
--- a/source/operator/prototype/reverse.c
+++ b/source/operator/prototype/reverse.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -48,13 +46,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     // sys_free(op->param_mem);
 }
 
-
 int register_reverse_op()
 {
     struct method m;
@@ -66,7 +62,6 @@ int register_reverse_op()
     return register_op(OP_REVERSE, OP_REVERSE_NAME, &m);
 }
 
-
 int unregister_reverse_op()
 {
     return unregister_op(OP_REVERSE, 1);
diff --git a/source/operator/prototype/rnn.c b/source/operator/prototype/rnn.c
index f037f95f9..0973a9f24 100644
--- a/source/operator/prototype/rnn.c
+++ b/source/operator/prototype/rnn.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    struct rnn_param* rnn_param = ( struct rnn_param* )(node->op.param_mem);
+    struct rnn_param* rnn_param = (struct rnn_param*)(node->op.param_mem);
     int dims[3];
 
     // input tensors:
@@ -57,10 +56,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct rnn_param* rnn_param = ( struct rnn_param* )sys_malloc(sizeof(struct rnn_param));
+    struct rnn_param* rnn_param = (struct rnn_param*)sys_malloc(sizeof(struct rnn_param));
 
     if (rnn_param == NULL)
     {
@@ -77,13 +75,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_rnn_op()
 {
     struct method m;
@@ -95,7 +91,6 @@ int register_rnn_op()
     return register_op(OP_RNN, OP_RNN_NAME, &m);
 }
 
-
 int unregister_rnn_op()
 {
     return unregister_op(OP_RNN, 1);
diff --git a/source/operator/prototype/roialign.c b/source/operator/prototype/roialign.c
index 999ec7afa..74b6b37ed 100644
--- a/source/operator/prototype/roialign.c
+++ b/source/operator/prototype/roialign.c
@@ -31,14 +31,13 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
 
-    struct roialign_param* roialign_param = ( struct roialign_param* )(node->op.param_mem);
+    struct roialign_param* roialign_param = (struct roialign_param*)(node->op.param_mem);
 
     int out_dim[4];
 
@@ -52,10 +51,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct roialign_param* roialign_param = ( struct roialign_param* )sys_malloc(sizeof(struct roialign_param));
+    struct roialign_param* roialign_param = (struct roialign_param*)sys_malloc(sizeof(struct roialign_param));
 
     if (roialign_param == NULL)
     {
@@ -75,13 +73,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_roialign_op()
 {
     struct method m;
@@ -90,11 +86,9 @@ int register_roialign_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_ROIALIGN, OP_ROIALIGN_NAME, &m);
 }
 
-
 int unregister_roialign_op()
 {
     return unregister_op(OP_ROIALIGN, 1);
diff --git a/source/operator/prototype/roipooling.c b/source/operator/prototype/roipooling.c
index 16cf9de0d..b2e13791c 100644
--- a/source/operator/prototype/roipooling.c
+++ b/source/operator/prototype/roipooling.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct roipooling_param* roipooling_param = ( struct roipooling_param* )node->op.param_mem;
+    struct roipooling_param* roipooling_param = (struct roipooling_param*)node->op.param_mem;
 
     int dims[4];
 
@@ -51,10 +50,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct roipooling_param* roipooling_param = ( struct roipooling_param* )sys_malloc(sizeof(struct roipooling_param));
+    struct roipooling_param* roipooling_param = (struct roipooling_param*)sys_malloc(sizeof(struct roipooling_param));
 
     if (roipooling_param == NULL)
     {
@@ -72,13 +70,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_roipooling_op()
 {
     struct method m;
diff --git a/source/operator/prototype/round.c b/source/operator/prototype/round.c
index a938b2f58..491637253 100644
--- a/source/operator/prototype/round.c
+++ b/source/operator/prototype/round.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -48,13 +46,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     // sys_free(op->param_mem);
 }
 
-
 int register_round_op()
 {
     struct method m;
@@ -66,7 +62,6 @@ int register_round_op()
     return register_op(OP_ROUND, OP_ROUND_NAME, &m);
 }
 
-
 int unregister_round_op()
 {
     return unregister_op(OP_ROUND, 1);
diff --git a/source/operator/prototype/rpn.c b/source/operator/prototype/rpn.c
index 424416a67..762653b65 100644
--- a/source/operator/prototype/rpn.c
+++ b/source/operator/prototype/rpn.c
@@ -34,7 +34,6 @@
 
 #include <math.h>
 
-
 void mkanchor(float w, float h, float x_ctr, float y_ctr, Anchor_t* tmp)
 {
     tmp->x0 = (x_ctr - 0.5f * (w - 1));
@@ -43,7 +42,6 @@ void mkanchor(float w, float h, float x_ctr, float y_ctr, Anchor_t* tmp)
     tmp->y1 = (y_ctr + 0.5f * (h - 1));
 }
 
-
 void whctrs(const Anchor_t anchor, Box_t* result)
 {
     result->w = (anchor.x1 - anchor.x0 + 1);
@@ -52,41 +50,38 @@ void whctrs(const Anchor_t anchor, Box_t* result)
     result->cy = ((anchor.y1 + anchor.y0) * 0.5f);
 }
 
-
 void scale_enum(const Anchor_t anchor, const struct vector* anchor_scales_, struct vector* result)
 {
     Box_t tmp_box;
     whctrs(anchor, &tmp_box);
 
-    for (int i = 0; i < ( int )anchor_scales_->elem_num; ++i)
+    for (int i = 0; i < (int)anchor_scales_->elem_num; ++i)
     {
         Anchor_t tmp;
 
-        float as_val = *( float* )(get_vector_data(( struct vector* )anchor_scales_, i));
+        float as_val = *(float*)(get_vector_data((struct vector*)anchor_scales_, i));
         mkanchor(tmp_box.w * as_val, tmp_box.h * as_val, tmp_box.cx, tmp_box.cy, &tmp);
         push_vector_data(result, &tmp);
     }
 }
 
-
 void ratio_enum(const Anchor_t anchor, const struct vector* ratios_, struct vector* result)
 {
     Box_t tmp_box;
     whctrs(anchor, &tmp_box);
     float area = tmp_box.h * tmp_box.w;
 
-    for (int i = 0; i < ( int )ratios_->elem_num; ++i)
+    for (int i = 0; i < (int)ratios_->elem_num; ++i)
     {
-        float size_ratio = area / *( float* )(get_vector_data(( struct vector* )ratios_, i));
+        float size_ratio = area / *(float*)(get_vector_data((struct vector*)ratios_, i));
         Anchor_t tmp;
         float new_w = roundf(sqrt(size_ratio));
-        float new_h = roundf(new_w * *( float* )(get_vector_data(( struct vector* )ratios_, i)));
+        float new_h = roundf(new_w * *(float*)(get_vector_data((struct vector*)ratios_, i)));
         mkanchor(new_w, new_h, tmp_box.cx, tmp_box.cy, &tmp);
         push_vector_data(result, &tmp);
     }
 }
 
-
 void generate_anchors(const int base_size, const struct vector* ratios_, const struct vector* scales_,
                       struct vector* gen_anchors_)
 {
@@ -99,14 +94,14 @@ void generate_anchors(const int base_size, const struct vector* ratios_, const s
     struct vector* ratio_anchors = create_vector(sizeof(struct Anchor), NULL);
 
     ratio_enum(base_anchor, ratios_, ratio_anchors);
-    for (int i = 0; i < ( int )ratio_anchors->elem_num; ++i)
+    for (int i = 0; i < (int)ratio_anchors->elem_num; ++i)
     {
         struct vector* scale_anchors = create_vector(sizeof(struct Anchor), NULL);
 
-        scale_enum(*( Anchor_t* )get_vector_data(ratio_anchors, i), scales_, scale_anchors);
+        scale_enum(*(Anchor_t*)get_vector_data(ratio_anchors, i), scales_, scale_anchors);
         for (int j = 0; j < scale_anchors->elem_num; j++)
         {
-            Anchor_t tmp_s = *( Anchor_t* )get_vector_data(scale_anchors, j);
+            Anchor_t tmp_s = *(Anchor_t*)get_vector_data(scale_anchors, j);
             push_vector_data(gen_anchors_, &tmp_s);
         }
 
@@ -121,7 +116,7 @@ static int infer_shape(struct node* node)
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    rpn_param_t* rpn_param = ( rpn_param_t* )node->op.param_mem;
+    rpn_param_t* rpn_param = (rpn_param_t*)node->op.param_mem;
 
     rpn_param->anchors_ = create_vector(sizeof(struct Anchor), NULL);
     generate_anchors(rpn_param->basesize, rpn_param->ratios, rpn_param->anchor_scales, rpn_param->anchors_);
@@ -136,10 +131,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct rpn_param* rpn_param = ( struct rpn_param* )sys_malloc(sizeof(struct rpn_param));
+    struct rpn_param* rpn_param = (struct rpn_param*)sys_malloc(sizeof(struct rpn_param));
 
     if (rpn_param == NULL)
     {
@@ -160,10 +154,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
-    struct rpn_param* rpn_param = ( struct rpn_param* )op->param_mem;
+    struct rpn_param* rpn_param = (struct rpn_param*)op->param_mem;
 
     if (rpn_param->anchors_)
         release_vector(rpn_param->anchors_);
@@ -175,7 +168,6 @@ static void release_op(struct op* op)
     sys_free(op->param_mem);
 }
 
-
 int register_rpn_op()
 {
     struct method m;
@@ -184,11 +176,9 @@ int register_rpn_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_RPN, OP_RPN_NAME, &m);
 }
 
-
 int unregister_rpn_op()
 {
     return unregister_op(OP_RPN, 1);
diff --git a/source/operator/prototype/scale.c b/source/operator/prototype/scale.c
index b51ad1e2e..0eaa7fc5a 100644
--- a/source/operator/prototype/scale.c
+++ b/source/operator/prototype/scale.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct scale_param* scale_param = ( struct scale_param* )sys_malloc(sizeof(struct scale_param));
+    struct scale_param* scale_param = (struct scale_param*)sys_malloc(sizeof(struct scale_param));
 
     if (scale_param == NULL)
     {
@@ -66,13 +64,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_scale_op()
 {
     ir_method_t m;
@@ -84,7 +80,6 @@ int register_scale_op()
     return register_op(OP_SCALE, OP_SCALE_NAME, &m);
 }
 
-
 int unregister_scale_op()
 {
     return unregister_op(OP_SCALE, 1);
diff --git a/source/operator/prototype/scatter.c b/source/operator/prototype/scatter.c
index 24b7baa8a..1c98cf9e6 100644
--- a/source/operator/prototype/scatter.c
+++ b/source/operator/prototype/scatter.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -42,7 +41,6 @@ static int infer_shape(struct node* node)
     return ret;
 }
 
-
 static int init_op(struct op* op)
 {
     struct scatter_param* scatter_param = (struct scatter_param*)sys_malloc(sizeof(struct scatter_param));
@@ -63,13 +61,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_scatter_op()
 {
     struct method m;
@@ -78,11 +74,9 @@ int register_scatter_op()
     m.release = release_op;
 
     return register_op(OP_SCATTER, OP_SCATTER_NAME, &m);
-
 }
 
-
 int unregister_scatter_op()
 {
-    return unregister_op(OP_SCATTER,1);
+    return unregister_op(OP_SCATTER, 1);
 }
diff --git a/source/operator/prototype/selu.c b/source/operator/prototype/selu.c
index d4bd5e479..4422d4ced 100644
--- a/source/operator/prototype/selu.c
+++ b/source/operator/prototype/selu.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -43,10 +42,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct selu_param* selu_param = ( struct selu_param* )sys_malloc(sizeof(struct selu_param));
+    struct selu_param* selu_param = (struct selu_param*)sys_malloc(sizeof(struct selu_param));
 
     if (selu_param == NULL)
     {
@@ -65,13 +63,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_selu_op()
 {
     struct method m;
@@ -83,7 +79,6 @@ int register_selu_op()
     return register_op(OP_SELU, OP_SELU_NAME, &m);
 }
 
-
 int unregister_selu_op()
 {
     return unregister_op(OP_SELU, 1);
diff --git a/source/operator/prototype/shape.c b/source/operator/prototype/shape.c
index 365ae6a5e..68dcde0dd 100644
--- a/source/operator/prototype/shape.c
+++ b/source/operator/prototype/shape.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
     op->same_shape = 0;
@@ -48,9 +46,9 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
-static void release_op(ir_op_t* op) {}
-
+static void release_op(ir_op_t* op)
+{
+}
 
 int register_shape_op()
 {
@@ -63,7 +61,6 @@ int register_shape_op()
     return register_op(OP_SHAPE, OP_SHAPE_NAME, &m);
 }
 
-
 int unregister_shape_op()
 {
     return unregister_op(OP_SHAPE, 1);
diff --git a/source/operator/prototype/shuffle_channel.c b/source/operator/prototype/shuffle_channel.c
index ba0898575..bd23d739b 100644
--- a/source/operator/prototype/shuffle_channel.c
+++ b/source/operator/prototype/shuffle_channel.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -43,11 +42,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct shuffle_channel_param* param =
-        ( struct shuffle_channel_param* )sys_malloc(sizeof(struct shuffle_channel_param));
+    struct shuffle_channel_param* param = (struct shuffle_channel_param*)sys_malloc(sizeof(struct shuffle_channel_param));
 
     if (param == NULL)
     {
@@ -66,13 +63,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_shuffle_channel_op()
 {
     struct method m;
@@ -84,7 +79,6 @@ int register_shuffle_channel_op()
     return register_op(OP_SHUFFLECHANNEL, OP_SHUFFLECHANNEL_NAME, &m);
 }
 
-
 int unregister_shuffle_channel_op()
 {
     return unregister_op(OP_SHUFFLECHANNEL, 1);
diff --git a/source/operator/prototype/sigmoid.c b/source/operator/prototype/sigmoid.c
index bbb249777..13eabcc7f 100644
--- a/source/operator/prototype/sigmoid.c
+++ b/source/operator/prototype/sigmoid.c
@@ -28,7 +28,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -40,7 +39,6 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
     op->same_shape = 0;
@@ -49,13 +47,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     // sys_free(op->param_mem);
 }
 
-
 int register_sigmoid_op()
 {
     ir_method_t m;
@@ -68,7 +64,6 @@ int register_sigmoid_op()
     return register_op(OP_SIGMOID, OP_SIGMOID_NAME, &m);
 }
 
-
 int unregister_sigmoid_op()
 {
     // sys_free(GET_PARAM_PARSE_MAP(sigmoid_param));
diff --git a/source/operator/prototype/slice.c b/source/operator/prototype/slice.c
index 67a61eab5..018369094 100644
--- a/source/operator/prototype/slice.c
+++ b/source/operator/prototype/slice.c
@@ -31,19 +31,20 @@
 #include "module/module.h"
 #include "utility/vector.h"
 #include "utility/sys_port.h"
-#include "utility/log.h"    // for: TLOG_ERR
+#include "utility/log.h" // for: TLOG_ERR
 
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
     ir_tensor_t* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
-    struct slice_param* slice_param = ( struct slice_param* )(node->op.param_mem);
+    struct slice_param* slice_param = (struct slice_param*)(node->op.param_mem);
     int dims_len = input->dim_num;
     int dims_in[TE_MAX_SHAPE_DIM_NUM * 2];
 
     // Check: axis must be in the range: [-input->dim_num, input->dim_num)
     // Note: Here we always assume 0 <= input->dim_num
-    if (slice_param->axis < -input->dim_num || input->dim_num <= slice_param->axis) {
+    if (slice_param->axis < -input->dim_num || input->dim_num <= slice_param->axis)
+    {
         TLOG_ERR("Input slice axis %d not to be supported.\n", slice_param->axis);
         return -1;
     }
@@ -67,8 +68,8 @@ static int infer_shape(ir_node_t* node)
             unsigned int i = 0;
             for (; i < slice_param->slice_point_->elem_num; ++i)
             {
-                dims_in[slice_axis] = (*( int* )get_vector_data(slice_param->slice_point_, i) - prev);
-                prev = *( int* )get_vector_data(slice_param->slice_point_, i);
+                dims_in[slice_axis] = (*(int*)get_vector_data(slice_param->slice_point_, i) - prev);
+                prev = *(int*)get_vector_data(slice_param->slice_point_, i);
                 set_ir_tensor_shape(get_ir_graph_tensor(ir_graph, node->output_tensors[i]), dims_in, dims_len);
             }
             // The last one
@@ -80,7 +81,7 @@ static int infer_shape(ir_node_t* node)
             int out_num = node->output_num;
             if (dims_in[slice_axis] % out_num != 0)
                 return -1;
-            if (slice_axis > ( int )dims_len)
+            if (slice_axis > (int)dims_len)
                 return -1;
             dims_in[slice_axis] = dims_in[slice_axis] / out_num;
             for (int i = 0; i < out_num; i++)
@@ -158,22 +159,20 @@ static int infer_shape(ir_node_t* node)
         int dim_len = input->dim_num;
         int out_dims[TE_MAX_SHAPE_DIM_NUM * 2];
         // input shape size must be equal to begin and size's size;
-        if ((slice_param->size_->elem_num != slice_param->begin_->elem_num) ||
-            (slice_param->size_->elem_num != dim_len))
+        if ((slice_param->size_->elem_num != slice_param->begin_->elem_num) || (slice_param->size_->elem_num != dim_len))
             return -1;
         for (unsigned int i = 0; i < dim_len; i++)
         {
-            out_dims[i] = *( int* )get_vector_data(slice_param->size_, i);
+            out_dims[i] = *(int*)get_vector_data(slice_param->size_, i);
         }
         set_ir_tensor_shape(get_ir_graph_tensor(ir_graph, node->output_tensors[0]), out_dims, dim_len);
     }
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    slice_param_t* slice_param = ( slice_param_t* )sys_malloc(sizeof(slice_param_t));
+    slice_param_t* slice_param = (slice_param_t*)sys_malloc(sizeof(slice_param_t));
 
     if (slice_param == NULL)
     {
@@ -194,10 +193,9 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
-    slice_param_t* slice_param = ( slice_param_t* )op->param_mem;
+    slice_param_t* slice_param = (slice_param_t*)op->param_mem;
 
     if (slice_param->slice_point_)
         release_vector(slice_param->slice_point_);
@@ -209,7 +207,6 @@ static void release_op(ir_op_t* op)
     sys_free(op->param_mem);
 }
 
-
 int register_slice_op()
 {
     ir_method_t m;
@@ -221,7 +218,6 @@ int register_slice_op()
     return register_op(OP_SLICE, OP_SLICE_NAME, &m);
 }
 
-
 int unregister_slice_op()
 {
     return unregister_op(OP_SLICE, 1);
diff --git a/source/operator/prototype/slice_param.h b/source/operator/prototype/slice_param.h
index dad3a6f96..1e1e8c605 100644
--- a/source/operator/prototype/slice_param.h
+++ b/source/operator/prototype/slice_param.h
@@ -26,7 +26,6 @@
 
 #include "stdint.h"
 
-
 typedef struct slice_param
 {
     struct vector* slice_point_;
diff --git a/source/operator/prototype/softmax.c b/source/operator/prototype/softmax.c
index e0cf47d63..e9e3a3a65 100644
--- a/source/operator/prototype/softmax.c
+++ b/source/operator/prototype/softmax.c
@@ -31,7 +31,6 @@
 #include "utility/vector.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -45,10 +44,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct softmax_param* softmax_param = ( struct softmax_param* )sys_malloc(sizeof(struct softmax_param));
+    struct softmax_param* softmax_param = (struct softmax_param*)sys_malloc(sizeof(struct softmax_param));
 
     if (softmax_param == NULL)
     {
@@ -66,13 +64,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_softmax_op()
 {
     ir_method_t m;
@@ -84,7 +80,6 @@ int register_softmax_op()
     return register_op(OP_SOFTMAX, OP_SOFTMAX_NAME, &m);
 }
 
-
 int unregister_softmax_op()
 {
     return unregister_op(OP_SOFTMAX, 1);
diff --git a/source/operator/prototype/softplus.c b/source/operator/prototype/softplus.c
index 8c5754015..0ffc08336 100644
--- a/source/operator/prototype/softplus.c
+++ b/source/operator/prototype/softplus.c
@@ -29,7 +29,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -49,7 +48,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-static void release_op(struct op* op) {}
+static void release_op(struct op* op)
+{
+}
 
 int register_softplus_op()
 {
diff --git a/source/operator/prototype/spacetobatchnd.c b/source/operator/prototype/spacetobatchnd.c
index aeb4fa5b7..5e94c9397 100644
--- a/source/operator/prototype/spacetobatchnd.c
+++ b/source/operator/prototype/spacetobatchnd.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)(node->op.param_mem);
@@ -43,10 +42,8 @@ static int infer_shape(struct node* node)
     int out_dim[4];
 
     out_dim[0] = input->dims[0] * (spacetobatchnd_param->dilation_x) * (spacetobatchnd_param->dilation_y);
-    out_dim[1] = (input->dims[1] + spacetobatchnd_param->pad_top + spacetobatchnd_param->pad_bottom) /
-                 spacetobatchnd_param->dilation_y;
-    out_dim[2] = (input->dims[2] + spacetobatchnd_param->pad_left + spacetobatchnd_param->pad_right) /
-                 spacetobatchnd_param->dilation_x;
+    out_dim[1] = (input->dims[1] + spacetobatchnd_param->pad_top + spacetobatchnd_param->pad_bottom) / spacetobatchnd_param->dilation_y;
+    out_dim[2] = (input->dims[2] + spacetobatchnd_param->pad_left + spacetobatchnd_param->pad_right) / spacetobatchnd_param->dilation_x;
     out_dim[3] = input->dims[3];
 
     set_ir_tensor_shape(output, out_dim, 4);
@@ -54,11 +51,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct spacetobatchnd_param* spacetobatchnd_param =
-        ( struct spacetobatchnd_param* )sys_malloc(sizeof(struct spacetobatchnd_param));
+    struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)sys_malloc(sizeof(struct spacetobatchnd_param));
 
     if (spacetobatchnd_param == NULL)
     {
@@ -81,13 +76,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_spacetobatchnd_op()
 {
     struct method m;
@@ -99,7 +92,6 @@ int register_spacetobatchnd_op()
     return register_op(OP_SPACETOBATCHND, OP_SPACETOBATCHND_NAME, &m);
 }
 
-
 int unregister_spacetobatchnd_op()
 {
     return unregister_op(OP_SPACETOBATCHND, 1);
diff --git a/source/operator/prototype/spacetodepth.c b/source/operator/prototype/spacetodepth.c
index 5680da626..85e4a5e9e 100644
--- a/source/operator/prototype/spacetodepth.c
+++ b/source/operator/prototype/spacetodepth.c
@@ -31,10 +31,9 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
-    struct spacetodepth_param* spacetodepth_param = ( struct spacetodepth_param* )(node->op.param_mem);
+    struct spacetodepth_param* spacetodepth_param = (struct spacetodepth_param*)(node->op.param_mem);
 
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
@@ -44,21 +43,19 @@ static int infer_shape(struct node* node)
     int dims[4];
     int block_size = spacetodepth_param->block_size;
 
-    dims[0] = input->dims[0];    // batch
-    dims[1] = input->dims[1] * (block_size * block_size);    // channel
-    dims[2] = input->dims[2] / block_size;    // height
-    dims[3] = input->dims[3] / block_size;    // width
+    dims[0] = input->dims[0];                             // batch
+    dims[1] = input->dims[1] * (block_size * block_size); // channel
+    dims[2] = input->dims[2] / block_size;                // height
+    dims[3] = input->dims[3] / block_size;                // width
 
     set_ir_tensor_shape(output, dims, 4);
 
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct spacetodepth_param* spacetodepth_param =
-        ( struct spacetodepth_param* )sys_malloc(sizeof(struct spacetodepth_param));
+    struct spacetodepth_param* spacetodepth_param = (struct spacetodepth_param*)sys_malloc(sizeof(struct spacetodepth_param));
 
     if (spacetodepth_param == NULL)
     {
@@ -76,13 +73,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_spacetodepth_op()
 {
     struct method m;
@@ -91,11 +86,9 @@ int register_spacetodepth_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_SPACETODEPTH, OP_SPACETODEPTH_NAME, &m);
 }
 
-
 int unregister_spacetodepth_op()
 {
     return unregister_op(OP_SPACETODEPTH, 1);
diff --git a/source/operator/prototype/sparsetodense.c b/source/operator/prototype/sparsetodense.c
index e9539802c..af8dffa45 100644
--- a/source/operator/prototype/sparsetodense.c
+++ b/source/operator/prototype/sparsetodense.c
@@ -31,10 +31,9 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
-    struct sparsetodense_param* sparsetodense_param = ( struct sparsetodense_param* )(node->op.param_mem);
+    struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)(node->op.param_mem);
 
     struct graph* graph = node->graph;
     struct tensor* input0 = get_ir_graph_tensor(graph, node->input_tensors[0]);
@@ -67,7 +66,6 @@ static int infer_shape(struct node* node)
     }
 }
 
-
 static int init_op(struct op* op)
 {
     struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)sys_malloc(sizeof(struct sparsetodense_param));
@@ -90,13 +88,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_sparsetodense_op()
 {
     struct method m;
@@ -108,7 +104,6 @@ int register_sparsetodense_op()
     return register_op(OP_SPARSETODENSE, OP_SPARSETODENSE_NAME, &m);
 }
 
-
 int unregister_sparsetodense_op()
 {
     return unregister_op(OP_SPARSETODENSE, 1);
diff --git a/source/operator/prototype/spatialtransformer.c b/source/operator/prototype/spatialtransformer.c
index 41b089e84..f3962bd1d 100644
--- a/source/operator/prototype/spatialtransformer.c
+++ b/source/operator/prototype/spatialtransformer.c
@@ -34,10 +34,9 @@
 
 #include <string.h>
 
-
 static int infer_shape(struct node* node)
 {
-    struct spatialtransformer_param* param = ( struct spatialtransformer_param* )(node->op.param_mem);
+    struct spatialtransformer_param* param = (struct spatialtransformer_param*)(node->op.param_mem);
 
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
@@ -45,25 +44,27 @@ static int infer_shape(struct node* node)
 
     struct vector* new_shape = create_vector(sizeof(int), NULL);
     int dim_size = 2;
-    for(int i = 0; i < dim_size; i++ ){
+    for (int i = 0; i < dim_size; i++)
+    {
         int shape = param->target_shape[i];
         push_vector_data(new_shape, (void*)&shape);
     }
 
-    int out_dim_size =4;
-    int* new_shape_temp = ( int* )sys_malloc(out_dim_size * sizeof(int));
+    int out_dim_size = 4;
+    int* new_shape_temp = (int*)sys_malloc(out_dim_size * sizeof(int));
 
-    if(dim_size == 2){
+    if (dim_size == 2)
+    {
         for (int i = 0; i < get_vector_num(new_shape); i++)
         {
-            int* a = ( int* )get_vector_data(new_shape, i);
-            new_shape_temp[i+dim_size] = *a;
+            int* a = (int*)get_vector_data(new_shape, i);
+            new_shape_temp[i + dim_size] = *a;
         }
         new_shape_temp[0] = 1;
         new_shape_temp[1] = input->dims[1];
     }
 
-    output->layout  = input->layout;
+    output->layout = input->layout;
     int ret = set_ir_tensor_shape(output, new_shape_temp, out_dim_size);
 
     sys_free(new_shape_temp);
@@ -71,7 +72,6 @@ static int infer_shape(struct node* node)
     return ret;
 }
 
-
 static int init_op(struct op* op)
 {
     struct spatialtransformer_param* spatialtransformer_param = (struct spatialtransformer_param*)sys_malloc(sizeof(struct spatialtransformer_param));
@@ -95,20 +95,16 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
-
-    struct spatialtransformer_param* param = ( struct spatialtransformer_param* )op->param_mem;
+    struct spatialtransformer_param* param = (struct spatialtransformer_param*)op->param_mem;
 
     if (param->target_shape)
         sys_free(param->target_shape);
 
     sys_free(op->param_mem);
-
 }
 
-
 int register_spatialtransformer_op()
 {
     struct method m;
@@ -120,7 +116,6 @@ int register_spatialtransformer_op()
     return register_op(OP_SPATIALTRANSFORMER, OP_SPATIALTRANSFORMER_NAME, &m);
 }
 
-
 int unregister_spatialtransformer_op()
 {
     return unregister_op(OP_SPATIALTRANSFORMER, 1);
diff --git a/source/operator/prototype/split.c b/source/operator/prototype/split.c
index 778d4e888..295a83821 100644
--- a/source/operator/prototype/split.c
+++ b/source/operator/prototype/split.c
@@ -32,12 +32,11 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* graph = node->graph;
     ir_tensor_t* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
-    struct split_param* split_param = ( struct split_param* )(node->op.param_mem);
+    struct split_param* split_param = (struct split_param*)(node->op.param_mem);
 
     int axis = split_param->axis;
 
@@ -62,7 +61,7 @@ static int infer_shape(ir_node_t* node)
 
             for (int i = 0; i < get_vector_num(split_param->split_sizes_); i++)
             {
-                sum_check += (( int* )get_vector_data(split_param->split_sizes_, i))[0];
+                sum_check += ((int*)get_vector_data(split_param->split_sizes_, i))[0];
             }
 
             if (sum_check != input_slice_num)
@@ -73,7 +72,7 @@ static int infer_shape(ir_node_t* node)
 
             for (int i = 0; i < get_vector_num(split_param->split_sizes_); i++)
             {
-                input_dim[axis] = (( int* )get_vector_data(split_param->split_sizes_, i))[0];
+                input_dim[axis] = ((int*)get_vector_data(split_param->split_sizes_, i))[0];
                 ir_tensor_t* output = get_ir_graph_tensor(graph, node->output_tensors[i]);
                 set_ir_tensor_shape(output, input_dim, input->dim_num);
             }
@@ -121,10 +120,9 @@ static int infer_shape(ir_node_t* node)
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
-    struct split_param* split_param = ( struct split_param* )sys_malloc(sizeof(struct split_param));
+    struct split_param* split_param = (struct split_param*)sys_malloc(sizeof(struct split_param));
 
     if (split_param == NULL)
     {
@@ -146,10 +144,9 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
-    struct split_param* split_param = ( struct split_param* )op->param_mem;
+    struct split_param* split_param = (struct split_param*)op->param_mem;
 
     if (split_param->split_sizes_)
         release_vector(split_param->split_sizes_);
@@ -157,7 +154,6 @@ static void release_op(ir_op_t* op)
     sys_free(op->param_mem);
 }
 
-
 int register_split_op()
 {
     ir_method_t m;
@@ -169,7 +165,6 @@ int register_split_op()
     return register_op(OP_SPLIT, OP_SPLIT_NAME, &m);
 }
 
-
 int unregister_split_op()
 {
     return unregister_op(OP_SPLIT, 1);
diff --git a/source/operator/prototype/squareddifference.c b/source/operator/prototype/squareddifference.c
index e7595a2d3..d4753d5db 100644
--- a/source/operator/prototype/squareddifference.c
+++ b/source/operator/prototype/squareddifference.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -51,7 +50,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -60,9 +58,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_squareddifference_op()
 {
@@ -75,7 +73,6 @@ int register_squareddifference_op()
     return register_op(OP_SQUAREDDIFFERENCE, OP_SQUAREDDIFFERENCE_NAME, &m);
 }
 
-
 int unregister_squareddifference_op()
 {
     return unregister_op(OP_SQUAREDDIFFERENCE, 1);
diff --git a/source/operator/prototype/squeeze.c b/source/operator/prototype/squeeze.c
index d2a027925..36767ebd9 100644
--- a/source/operator/prototype/squeeze.c
+++ b/source/operator/prototype/squeeze.c
@@ -32,13 +32,12 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct squeeze_param* squeeze_param = ( struct squeeze_param* )node->op.param_mem;
+    struct squeeze_param* squeeze_param = (struct squeeze_param*)node->op.param_mem;
 
     int in_size = input->dim_num;
 
@@ -65,7 +64,7 @@ static int infer_shape(struct node* node)
         dim_size++;
     }
 
-    int8_t should_squeeze[4] = { 0 };
+    int8_t should_squeeze[4] = {0};
     int squeezeddim = 0;
     int newshape_size = dim_size;
     int real_shape[4] = {0, 2, 3, 1};
@@ -111,7 +110,7 @@ static int infer_shape(struct node* node)
         }
     }
 
-    int* odim = ( int* )sys_malloc((in_size - squeezeddim) * sizeof(int));
+    int* odim = (int*)sys_malloc((in_size - squeezeddim) * sizeof(int));
     int o_idx = 0;
     for (int i_idx = 0; i_idx < in_size; i_idx++)
     {
@@ -125,10 +124,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct squeeze_param* squeeze_param = ( struct squeeze_param* )sys_malloc(sizeof(struct squeeze_param));
+    struct squeeze_param* squeeze_param = (struct squeeze_param*)sys_malloc(sizeof(struct squeeze_param));
 
     if (squeeze_param == NULL)
     {
@@ -148,13 +146,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_squeeze_op()
 {
     struct method m;
@@ -166,7 +162,6 @@ int register_squeeze_op()
     return register_op(OP_SQUEEZE, OP_SQUEEZE_NAME, &m);
 }
 
-
 int unregister_squeeze_op()
 {
     return unregister_op(OP_SQUEEZE, 1);
diff --git a/source/operator/prototype/strided_slice.c b/source/operator/prototype/strided_slice.c
index 8bf798699..1a1cac9b4 100644
--- a/source/operator/prototype/strided_slice.c
+++ b/source/operator/prototype/strided_slice.c
@@ -33,30 +33,25 @@
 
 #include <math.h>
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct strided_slice_param* param_ = ( struct strided_slice_param* )(node->op.param_mem);
+    struct strided_slice_param* param_ = (struct strided_slice_param*)(node->op.param_mem);
 
-    int delta_0 = (-param_->begin[0] + param_->end[0]) < 0 ? param_->begin[0] - param_->end[0] :
-                                                             -param_->begin[0] + param_->end[0];
-    int delta_1 = (-param_->begin[1] + param_->end[1]) < 0 ? param_->begin[1] - param_->end[1] :
-                                                             -param_->begin[1] + param_->end[1];
-    int delta_2 = (-param_->begin[2] + param_->end[2]) < 0 ? param_->begin[2] - param_->end[2] :
-                                                             -param_->begin[2] + param_->end[2];
-    int delta_3 = (-param_->begin[3] + param_->end[3]) < 0 ? param_->begin[3] - param_->end[3] :
-                                                             -param_->begin[3] + param_->end[3];
+    int delta_0 = (-param_->begin[0] + param_->end[0]) < 0 ? param_->begin[0] - param_->end[0] : -param_->begin[0] + param_->end[0];
+    int delta_1 = (-param_->begin[1] + param_->end[1]) < 0 ? param_->begin[1] - param_->end[1] : -param_->begin[1] + param_->end[1];
+    int delta_2 = (-param_->begin[2] + param_->end[2]) < 0 ? param_->begin[2] - param_->end[2] : -param_->begin[2] + param_->end[2];
+    int delta_3 = (-param_->begin[3] + param_->end[3]) < 0 ? param_->begin[3] - param_->end[3] : -param_->begin[3] + param_->end[3];
 
     int dims[4] = {0};
-    dims[0] = ceil((( float )input->dims[0] - ( float )delta_0) / ( float )param_->stride[0]);
-    dims[1] = ceil((( float )input->dims[1] - ( float )delta_1) / ( float )param_->stride[1]);
-    dims[2] = ceil((( float )input->dims[2] - ( float )delta_2) / ( float )param_->stride[2]);
-    dims[3] = ceil((( float )input->dims[3] - ( float )delta_3) / ( float )param_->stride[3]);
+    dims[0] = ceil(((float)input->dims[0] - (float)delta_0) / (float)param_->stride[0]);
+    dims[1] = ceil(((float)input->dims[1] - (float)delta_1) / (float)param_->stride[1]);
+    dims[2] = ceil(((float)input->dims[2] - (float)delta_2) / (float)param_->stride[2]);
+    dims[3] = ceil(((float)input->dims[3] - (float)delta_3) / (float)param_->stride[3]);
 
-    for (int i=0; i<4; i++)
+    for (int i = 0; i < 4; i++)
     {
         if (dims[i] == 0)
             dims[i] = 1;
@@ -67,11 +62,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct strided_slice_param* strided_slice_param =
-        ( struct strided_slice_param* )sys_malloc(sizeof(struct strided_slice_param));
+    struct strided_slice_param* strided_slice_param = (struct strided_slice_param*)sys_malloc(sizeof(struct strided_slice_param));
 
     if (strided_slice_param == NULL)
     {
@@ -93,13 +86,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_strided_slice_op()
 {
     struct method m;
@@ -111,7 +102,6 @@ int register_strided_slice_op()
     return register_op(OP_STRIDED_SLICE, OP_STRIDEDSLICE_NAME, &m);
 }
 
-
 int unregister_strided_slice_op()
 {
     return unregister_op(OP_STRIDED_SLICE, 1);
diff --git a/source/operator/prototype/swap_axis.c b/source/operator/prototype/swap_axis.c
index 66d9561ca..22f4ba96b 100644
--- a/source/operator/prototype/swap_axis.c
+++ b/source/operator/prototype/swap_axis.c
@@ -31,13 +31,12 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct swap_axis_param* swap_axis_param = ( struct swap_axis_param* )node->op.param_mem;
+    struct swap_axis_param* swap_axis_param = (struct swap_axis_param*)node->op.param_mem;
 
     if (swap_axis_param->dim_0 == swap_axis_param->dim_1)
     {
@@ -59,7 +58,7 @@ static int infer_shape(struct node* node)
     if (swap_axis_param->dim_0 >= in_size || swap_axis_param->dim_1 >= in_size)
         return -1;
 
-    int* newdim = ( int* )sys_malloc(in_size * sizeof(int));
+    int* newdim = (int*)sys_malloc(in_size * sizeof(int));
     for (int i = 0; i < in_size; i++)
     {
         newdim[i] = input->dims[i];
@@ -72,10 +71,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct swap_axis_param* swap_axis_param = ( struct swap_axis_param* )sys_malloc(sizeof(struct swap_axis_param));
+    struct swap_axis_param* swap_axis_param = (struct swap_axis_param*)sys_malloc(sizeof(struct swap_axis_param));
 
     if (swap_axis_param == NULL)
     {
@@ -93,13 +91,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_swap_axis_op()
 {
     struct method m;
@@ -111,7 +107,6 @@ int register_swap_axis_op()
     return register_op(OP_SWAP_AXIS, OP_SWAP_AXIS_NAME, &m);
 }
 
-
 int unregister_swap_axis_op()
 {
     return unregister_op(OP_SWAP_AXIS, 1);
diff --git a/source/operator/prototype/tanh.c b/source/operator/prototype/tanh.c
index 96cda2e32..aa40d591c 100644
--- a/source/operator/prototype/tanh.c
+++ b/source/operator/prototype/tanh.c
@@ -28,7 +28,6 @@
 #include "module/module.h"
 #include "utility/vector.h"
 
-
 static int infer_shape(ir_node_t* node)
 {
     ir_graph_t* ir_graph = node->graph;
@@ -36,13 +35,12 @@ static int infer_shape(ir_node_t* node)
     ir_tensor_t* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
 
     output->layout = input->layout;
-    
+
     set_ir_tensor_shape(output, input->dims, input->dim_num);
 
     return 0;
 }
 
-
 static int init_op(ir_op_t* op)
 {
     op->same_shape = 0;
@@ -51,13 +49,11 @@ static int init_op(ir_op_t* op)
     return 0;
 }
 
-
 static void release_op(ir_op_t* op)
 {
     // sys_free(op->param_mem);
 }
 
-
 int register_tanh_op()
 {
     ir_method_t m;
@@ -70,7 +66,6 @@ int register_tanh_op()
     return register_op(OP_TANH, OP_TANH_NAME, &m);
 }
 
-
 int unregister_tanh_op()
 {
     // sys_free(GET_PARAM_PARSE_MAP(tanh_param));
diff --git a/source/operator/prototype/threshold.c b/source/operator/prototype/threshold.c
index 3af483dd8..8d98c24af 100644
--- a/source/operator/prototype/threshold.c
+++ b/source/operator/prototype/threshold.c
@@ -32,7 +32,6 @@
 
 #include <string.h>
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
@@ -44,10 +43,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct threshold_param* param = ( struct threshold_param* )sys_malloc(sizeof(struct threshold_param));
+    struct threshold_param* param = (struct threshold_param*)sys_malloc(sizeof(struct threshold_param));
 
     if (param == NULL)
     {
@@ -64,13 +62,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_threshold_op()
 {
     struct method m;
@@ -82,7 +78,6 @@ int register_threshold_op()
     return register_op(OP_THRESHOLD, OP_THRESHOLD_NAME, &m);
 }
 
-
 int unregister_threshold_op()
 {
     return unregister_op(OP_THRESHOLD, 1);
diff --git a/source/operator/prototype/tile.c b/source/operator/prototype/tile.c
index 10ba81625..faf647837 100644
--- a/source/operator/prototype/tile.c
+++ b/source/operator/prototype/tile.c
@@ -37,7 +37,6 @@
 #include <stdio.h>
 #endif
 
-
 static int infer_shape(struct node* node)
 {
     struct tile_param* param = (struct tile_param*)node->op.param_mem;
@@ -55,20 +54,20 @@ static int infer_shape(struct node* node)
 
     struct vector* reps_vector = create_vector(sizeof(int), NULL);
 
-    for(int i = 0; i < param->reps_size; i++)
+    for (int i = 0; i < param->reps_size; i++)
     {
         push_vector_data(reps_vector, (void*)&param->reps[i]);
     }
 
-    if(frame == 0) // caffe
+    if (frame == 0) // caffe
     {
         int param_size = get_vector_num(reps_vector);
-        if(param_size != 0)
+        if (param_size != 0)
         {
-            for(int i = 0; i < param_size / 2; i++)
+            for (int i = 0; i < param_size / 2; i++)
             {
-                int temp = ((int*)get_vector_data(reps_vector,0))[0];
-                int ori_reps = ((int*)get_vector_data(reps_vector, param_size -i -1))[0];
+                int temp = ((int*)get_vector_data(reps_vector, 0))[0];
+                int ori_reps = ((int*)get_vector_data(reps_vector, param_size - i - 1))[0];
                 set_vector_data(reps_vector, i, (void*)&ori_reps);
             }
         }
@@ -77,45 +76,45 @@ static int infer_shape(struct node* node)
             return -1;
         }
         int push_data = 1;
-        switch(param_size)
+        switch (param_size)
         {
-            case 0:
-                for(int i = 0; i < 4; i++)
-                {
-                    push_vector_data(reps_vector, (void*)&push_data);
-                }
-                break;
-            case 1:
-                for(int i = 0; i < 3; i++)
-                {
-                    push_vector_data(reps_vector, (void*)&push_data);
-                };
-                break;
-            case 2:
-                for(int i = 0; i < 2; i++)
-                {
-                    push_vector_data(reps_vector, (void*)&push_data);
-                }
-                break;
-            case 3:
+        case 0:
+            for (int i = 0; i < 4; i++)
+            {
+                push_vector_data(reps_vector, (void*)&push_data);
+            }
+            break;
+        case 1:
+            for (int i = 0; i < 3; i++)
+            {
+                push_vector_data(reps_vector, (void*)&push_data);
+            };
+            break;
+        case 2:
+            for (int i = 0; i < 2; i++)
+            {
                 push_vector_data(reps_vector, (void*)&push_data);
-                break;
-            default:
-                break;
+            }
+            break;
+        case 3:
+            push_vector_data(reps_vector, (void*)&push_data);
+            break;
+        default:
+            break;
         }
 
-        output_n = input_tensor->dims[0]*(( int* )get_vector_data(reps_vector, 3))[0];
-        output_c = input_tensor->dims[1]*(( int* )get_vector_data(reps_vector, 2))[0];
-        output_h = input_tensor->dims[2]*(( int* )get_vector_data(reps_vector, 1))[0];
-        output_w = input_tensor->dims[3]*(( int* )get_vector_data(reps_vector, 0))[0];
-    } 
-    else if (frame == 1) 
+        output_n = input_tensor->dims[0] * ((int*)get_vector_data(reps_vector, 3))[0];
+        output_c = input_tensor->dims[1] * ((int*)get_vector_data(reps_vector, 2))[0];
+        output_h = input_tensor->dims[2] * ((int*)get_vector_data(reps_vector, 1))[0];
+        output_w = input_tensor->dims[3] * ((int*)get_vector_data(reps_vector, 0))[0];
+    }
+    else if (frame == 1)
     {
         printf("Tile::InferShape onnx\n");
     }
 
-    int* new_shape = (int*)sys_malloc(get_vector_num(reps_vector)*sizeof(int));
-    for(int i = 0; i < get_vector_num(reps_vector); i++)
+    int* new_shape = (int*)sys_malloc(get_vector_num(reps_vector) * sizeof(int));
+    for (int i = 0; i < get_vector_num(reps_vector); i++)
     {
         int* a = (int*)get_vector_data(reps_vector, i);
         new_shape[i] = *a;
@@ -127,17 +126,16 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct tile_param* tile_param = ( struct tile_param* )sys_malloc(sizeof(struct tile_param));
+    struct tile_param* tile_param = (struct tile_param*)sys_malloc(sizeof(struct tile_param));
 
     if (tile_param == NULL)
     {
         return -1;
     }
 
-    memset(tile_param,0,sizeof(struct tile_param));
+    memset(tile_param, 0, sizeof(struct tile_param));
     op->param_mem = tile_param;
     op->param_size = sizeof(struct tile_param);
     op->same_shape = 0;
@@ -146,16 +144,14 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     struct tile_param* tile_param = (struct tile_param*)op->param_mem;
-    if(tile_param->reps)
+    if (tile_param->reps)
         sys_free(tile_param->reps);
     sys_free(op->param_mem);
 }
 
-
 int register_tile_op()
 {
     struct method m;
@@ -163,13 +159,10 @@ int register_tile_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_TILE, OP_TILE_NAME, &m);
-
 }
 
-
 int unregister_tile_op()
 {
-    return unregister_op(OP_TILE,1);
+    return unregister_op(OP_TILE, 1);
 }
diff --git a/source/operator/prototype/topkv2.c b/source/operator/prototype/topkv2.c
index 50ddfbbb2..a98e773e2 100644
--- a/source/operator/prototype/topkv2.c
+++ b/source/operator/prototype/topkv2.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct topkv2_param* topkv2_param = (struct topkv2_param*)node->op.param_mem;
@@ -42,7 +41,7 @@ static int infer_shape(struct node* node)
     struct tensor* output1 = get_ir_graph_tensor(ir_graph, node->output_tensors[1]);
 
     int in_size = input->dim_num;
-    int* in_dim = ( int* )sys_malloc((in_size) * sizeof(int));
+    int* in_dim = (int*)sys_malloc((in_size) * sizeof(int));
 
     if (topkv2_param->k > input->dims[in_size - 1])
     {
@@ -61,10 +60,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct topkv2_param* topkv2_param = ( struct topkv2_param* )sys_malloc(sizeof(struct topkv2_param));
+    struct topkv2_param* topkv2_param = (struct topkv2_param*)sys_malloc(sizeof(struct topkv2_param));
 
     if (topkv2_param == NULL)
     {
@@ -82,13 +80,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_topkv2_op()
 {
     struct method m;
@@ -97,11 +93,9 @@ int register_topkv2_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_TOPKV2, OP_TOPKV2_NAME, &m);
 }
 
-
 int unregister_topkv2_op()
 {
     return unregister_op(OP_TOPKV2, 1);
diff --git a/source/operator/prototype/transpose.c b/source/operator/prototype/transpose.c
index 50024f101..5c211d453 100644
--- a/source/operator/prototype/transpose.c
+++ b/source/operator/prototype/transpose.c
@@ -32,16 +32,15 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);
-    struct transpose_param* param = ( struct transpose_param* )(node->op.param_mem);
+    struct transpose_param* param = (struct transpose_param*)(node->op.param_mem);
 
     int new_shape_size = param->tr_shape_size;
-    int* out_dims = ( int* )sys_malloc(new_shape_size * sizeof(int));
+    int* out_dims = (int*)sys_malloc(new_shape_size * sizeof(int));
 
     for (int i = 0; i < new_shape_size; i++)
     {
@@ -54,10 +53,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct transpose_param* transpose_param = ( struct transpose_param* )sys_malloc(sizeof(struct transpose_param));
+    struct transpose_param* transpose_param = (struct transpose_param*)sys_malloc(sizeof(struct transpose_param));
 
     if (transpose_param == NULL)
     {
@@ -78,10 +76,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
-    struct transpose_param* transpose_param = ( struct transpose_param* )op->param_mem;
+    struct transpose_param* transpose_param = (struct transpose_param*)op->param_mem;
 
     if (transpose_param->tr_shape)
         sys_free(transpose_param->tr_shape);
@@ -89,7 +86,6 @@ static void release_op(struct op* op)
     sys_free(op->param_mem);
 }
 
-
 int register_transpose_op()
 {
     struct method m;
@@ -98,11 +94,9 @@ int register_transpose_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_TRANSPOSE, OP_TRANSPOSE_NAME, &m);
 }
 
-
 int unregister_transpose_op()
 {
     return unregister_op(OP_TRANSPOSE, 1);
diff --git a/source/operator/prototype/unary.c b/source/operator/prototype/unary.c
index cb72cfd7f..8c0c64196 100644
--- a/source/operator/prototype/unary.c
+++ b/source/operator/prototype/unary.c
@@ -31,7 +31,6 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -45,10 +44,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct unary_param* unary_param = ( struct unary_param* )sys_malloc(sizeof(struct unary_param));
+    struct unary_param* unary_param = (struct unary_param*)sys_malloc(sizeof(struct unary_param));
 
     if (unary_param == NULL)
     {
@@ -66,13 +64,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_unary_op()
 {
     struct method m;
@@ -81,11 +77,9 @@ int register_unary_op()
     m.init = init_op;
     m.release = release_op;
 
-
     return register_op(OP_UNARY, OP_UNARY_NAME, &m);
 }
 
-
 int unregister_unary_op()
 {
     return unregister_op(OP_UNARY, 1);
diff --git a/source/operator/prototype/unsqueeze.c b/source/operator/prototype/unsqueeze.c
index 196e6795f..77b75f8d5 100644
--- a/source/operator/prototype/unsqueeze.c
+++ b/source/operator/prototype/unsqueeze.c
@@ -31,16 +31,15 @@
 #include "module/module.h"
 #include "utility/sys_port.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
     struct tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
-    struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )node->op.param_mem;
+    struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)node->op.param_mem;
 
     int axises_size = unsqueeze_param->axises_size;
-    int* out_dim = ( int* )sys_malloc((input->dim_num + axises_size) * sizeof(int));
+    int* out_dim = (int*)sys_malloc((input->dim_num + axises_size) * sizeof(int));
 
     if (axises_size == 1)
     {
@@ -90,7 +89,6 @@ static int infer_shape(struct node* node)
                 out_dim[i] = input->dims[k];
                 k++;
             }
-
         }
     }
 
@@ -101,10 +99,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )sys_malloc(sizeof(struct unsqueeze_param));
+    struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)sys_malloc(sizeof(struct unsqueeze_param));
 
     if (unsqueeze_param == NULL)
     {
@@ -121,7 +118,6 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)op->param_mem;
@@ -130,7 +126,6 @@ static void release_op(struct op* op)
     sys_free(op->param_mem);
 }
 
-
 int register_unsqueeze_op()
 {
     struct method m;
@@ -142,7 +137,6 @@ int register_unsqueeze_op()
     return register_op(OP_UNSQUEEZE, OP_UNSQUEEZE_NAME, &m);
 }
 
-
 int unregister_unsqueeze_op()
 {
     return unregister_op(OP_UNSQUEEZE, 1);
diff --git a/source/operator/prototype/upsample.c b/source/operator/prototype/upsample.c
index a026e79c0..fc7e29ebc 100644
--- a/source/operator/prototype/upsample.c
+++ b/source/operator/prototype/upsample.c
@@ -32,10 +32,9 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
-    struct upsample_param* upsample_param = ( struct upsample_param* )(node->op.param_mem);
+    struct upsample_param* upsample_param = (struct upsample_param*)(node->op.param_mem);
 
     struct graph* graph = node->graph;
     struct tensor* input = get_ir_graph_tensor(graph, node->input_tensors[0]);
@@ -55,10 +54,9 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
-    struct upsample_param* upsample_param = ( struct upsample_param* )sys_malloc(sizeof(struct upsample_param));
+    struct upsample_param* upsample_param = (struct upsample_param*)sys_malloc(sizeof(struct upsample_param));
 
     if (upsample_param == NULL)
     {
@@ -76,13 +74,11 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
 static void release_op(struct op* op)
 {
     sys_free(op->param_mem);
 }
 
-
 int register_upsample_op()
 {
     struct method m;
@@ -94,7 +90,6 @@ int register_upsample_op()
     return register_op(OP_UPSAMPLE, OP_UPSAMPLE_NAME, &m);
 }
 
-
 int unregister_upsample_op()
 {
     return unregister_op(OP_UPSAMPLE, 1);
diff --git a/source/operator/prototype/where.c b/source/operator/prototype/where.c
index 8b9285657..4a97d6b22 100644
--- a/source/operator/prototype/where.c
+++ b/source/operator/prototype/where.c
@@ -30,7 +30,6 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -42,7 +41,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -51,9 +49,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_where_op()
 {
@@ -66,7 +64,6 @@ int register_where_op()
     return register_op(OP_WHERE, OP_WHERE_NAME, &m);
 }
 
-
 int unregister_where_op()
 {
     return unregister_op(OP_WHERE, 1);
diff --git a/source/operator/prototype/zeroslike.c b/source/operator/prototype/zeroslike.c
index 251adab6c..a6ae52644 100644
--- a/source/operator/prototype/zeroslike.c
+++ b/source/operator/prototype/zeroslike.c
@@ -27,7 +27,6 @@
 #include "graph/graph.h"
 #include "module/module.h"
 
-
 static int infer_shape(struct node* node)
 {
     struct graph* ir_graph = node->graph;
@@ -39,7 +38,6 @@ static int infer_shape(struct node* node)
     return 0;
 }
 
-
 static int init_op(struct op* op)
 {
     op->same_shape = 0;
@@ -48,9 +46,9 @@ static int init_op(struct op* op)
     return 0;
 }
 
-
-static void release_op(struct op* op) {}
-
+static void release_op(struct op* op)
+{
+}
 
 int register_zeroslike_op()
 {
@@ -63,7 +61,6 @@ int register_zeroslike_op()
     return register_op(OP_ZEROSLIKE, OP_ZEROSLIKE_NAME, &m);
 }
 
-
 int unregister_zeroslike_op()
 {
     return unregister_op(OP_ZEROSLIKE, 1);
diff --git a/source/optimizer/estimation.c b/source/optimizer/estimation.c
index 7e8e604ea..2fba81972 100644
--- a/source/optimizer/estimation.c
+++ b/source/optimizer/estimation.c
@@ -39,21 +39,19 @@
 #include <stdlib.h>
 #endif
 
-
 void init_memory_block(memory_block_t* memory_block, uint16_t index)
 {
     if (NULL != memory_block)
     {
         memory_block->index = index;
-        memory_block->size  = 0;
+        memory_block->size = 0;
         memory_block->tensor_count = 0;
-        memory_block->tensor_list  = NULL;
+        memory_block->tensor_list = NULL;
         memory_block->tensor_index = 0;
         memory_block->inuse = 0;
     }
 }
 
-
 memory_block_t* find_unused_memory_block(struct vector* memory_blocks)
 {
     int memory_blocks_count = get_vector_num(memory_blocks);
@@ -69,7 +67,6 @@ memory_block_t* find_unused_memory_block(struct vector* memory_blocks)
     return NULL;
 }
 
-
 memory_block_t* get_usable_memory_block(struct vector* memory_blocks)
 {
     memory_block_t* memory_block = find_unused_memory_block(memory_blocks);
@@ -89,13 +86,12 @@ memory_block_t* get_usable_memory_block(struct vector* memory_blocks)
     return memory_block;
 }
 
-
 int mark_memory_block_with_tensor(ir_graph_t* graph, memory_block_t* memory_block, uint16_t index)
 {
     ir_tensor_t* tensor = get_ir_graph_tensor(graph, index);
 
     memory_block->tensor_count += 1;
-    memory_block->tensor_list  = (uint16_t*)sys_realloc(memory_block->tensor_list, memory_block->tensor_count * sizeof(uint16_t));
+    memory_block->tensor_list = (uint16_t*)sys_realloc(memory_block->tensor_list, memory_block->tensor_count * sizeof(uint16_t));
     memory_block->inuse = 1;
 
     uint32_t tensor_buffer_size = tensor->elem_num * tensor->elem_size;
@@ -108,7 +104,6 @@ int mark_memory_block_with_tensor(ir_graph_t* graph, memory_block_t* memory_bloc
     return 0;
 }
 
-
 int estimate_subgraph_memory_blocks(struct subgraph* subgraph, struct vector* memory_blocks)
 {
     if (NULL == subgraph || NULL == memory_blocks)
diff --git a/source/optimizer/estimation.h b/source/optimizer/estimation.h
index b2f1580bd..3cc51026d 100644
--- a/source/optimizer/estimation.h
+++ b/source/optimizer/estimation.h
@@ -30,22 +30,20 @@
 struct subgraph;
 struct vector;
 
-
 /*!
  * @struct ir_subgraph_t
  * @brief  Abstract subgraph intermediate representation
  */
 typedef struct memory_block
 {
-    uint16_t  index;           //!< the index of a memory_block
-    uint32_t  size;            //!< final estimated memory size
-    uint16_t  tensor_count;    //!< referenced tensor count
-    uint16_t* tensor_list;     //!< referenced tensor list
-    uint16_t  tensor_index;    //!< referenced tensor index, which is largest one
-    uint8_t   inuse;           //!< flag mark if this block is inuse
+    uint16_t index;        //!< the index of a memory_block
+    uint32_t size;         //!< final estimated memory size
+    uint16_t tensor_count; //!< referenced tensor count
+    uint16_t* tensor_list; //!< referenced tensor list
+    uint16_t tensor_index; //!< referenced tensor index, which is largest one
+    uint8_t inuse;         //!< flag mark if this block is inuse
 } memory_block_t;
 
-
 /*!
  * @brief  Init tensor quantization parameter.
  *
@@ -54,7 +52,6 @@ typedef struct memory_block
  */
 void init_memory_block(memory_block_t* memory_block, uint16_t index);
 
-
 /*!
  * @brief  Set tensor quantization parameter.
  *
diff --git a/source/optimizer/helper.c b/source/optimizer/helper.c
index ac2439d68..e325fc22b 100644
--- a/source/optimizer/helper.c
+++ b/source/optimizer/helper.c
@@ -30,7 +30,6 @@
 #include "graph/subgraph.h"
 #include "operator/op.h"
 
-
 int is_index_in_array(const uint16_t* array, const uint16_t array_size, const uint16_t index)
 {
     for (uint16_t i = 0; i < array_size; i++)
@@ -46,19 +45,16 @@ int is_index_in_array(const uint16_t* array, const uint16_t array_size, const ui
     return 0;
 }
 
-
 int is_subgraph_input_tensor(const struct subgraph* subgraph, const uint16_t tensor_index)
 {
     return is_index_in_array(subgraph->input_tensor_list, (uint16_t)subgraph->input_num, tensor_index);
 }
 
-
 int is_subgraph_output_tensor(const struct subgraph* subgraph, const uint16_t tensor_index)
 {
     return is_index_in_array(subgraph->output_tensor_list, (uint16_t)subgraph->input_num, tensor_index);
 }
 
-
 int is_variable_tensor_in_subgraph(const ir_subgraph_t* subgraph, const uint16_t tensor_index)
 {
     // only each node outputs need to be checked next
diff --git a/source/optimizer/helper.h b/source/optimizer/helper.h
index d684c317e..f626ed705 100644
--- a/source/optimizer/helper.h
+++ b/source/optimizer/helper.h
@@ -30,7 +30,6 @@
 struct subgraph;
 struct vector;
 
-
 int is_subgraph_input_tensor(const struct subgraph* subgraph, uint16_t tensor_index);
 
 int is_subgraph_output_tensor(const struct subgraph* subgraph, uint16_t tensor_index);
diff --git a/source/optimizer/split.c b/source/optimizer/split.c
index 2aaafedb9..75004e189 100644
--- a/source/optimizer/split.c
+++ b/source/optimizer/split.c
@@ -39,7 +39,6 @@
 
 #define MODEL_COMPLEX_COUNT 3
 
-
 int check_sub_info(struct graph* ir_graph)
 {
     int subgraph_num = get_vector_num(ir_graph->subgraph_list);
@@ -51,7 +50,6 @@ int check_sub_info(struct graph* ir_graph)
     return -1;
 }
 
-
 int tensor_in_precision(const struct tensor* tensor, struct vector* allowed_precision)
 {
     int count = get_vector_num(allowed_precision);
@@ -67,7 +65,6 @@ int tensor_in_precision(const struct tensor* tensor, struct vector* allowed_prec
     return -1;
 }
 
-
 int node_in_precision(const struct graph* ir_graph, uint16_t node_id, struct vector* allowed_precision)
 {
     if (node_id > ir_graph->node_num)
@@ -100,7 +97,6 @@ int node_in_precision(const struct graph* ir_graph, uint16_t node_id, struct vec
     return -1;
 }
 
-
 int node_in_list(const struct graph* ir_graph, struct vector* ops_list, const uint16_t node_id)
 {
     if (NULL == ir_graph || NULL == ops_list)
@@ -122,7 +118,6 @@ int node_in_list(const struct graph* ir_graph, struct vector* ops_list, const ui
     return -1;
 }
 
-
 struct vector* get_graph_blocked_nodes(const struct graph* ir_graph, struct vector* blocked_ops, struct vector* allowed_precision)
 {
     struct vector* blocked_nodes_list = create_vector(sizeof(uint16_t), NULL);
@@ -141,7 +136,6 @@ struct vector* get_graph_blocked_nodes(const struct graph* ir_graph, struct vect
     return blocked_nodes_list;
 }
 
-
 // policy has some issue, must be fixed
 void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* allowed_precision)
 {
@@ -156,7 +150,6 @@ void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowe
         // scan from back to front
         for (int i = blocked_nodes_count - 1; i >= 0; i--)
         {
-
             // start node id (the blocked one)
             uint16_t first_node_id = *((uint16_t*)get_vector_data(blocked_nodes_list, i));
             // end node id (not including its self; the next blocked one, or the last one)
@@ -186,7 +179,7 @@ void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowe
                 }
             }
 
-            if (children_nodes_is_complicated < MODEL_COMPLEX_COUNT)   // directly add these nodes to sub graph list
+            if (children_nodes_is_complicated < MODEL_COMPLEX_COUNT) // directly add these nodes to sub graph list
             {
                 struct subgraph* sub_graph = (struct subgraph*)sys_malloc(sizeof(struct subgraph));
                 init_ir_subgraph((struct graph*)ir_graph, sub_graph, 0);
@@ -318,7 +311,6 @@ void split_graph_node_to_sub_graph(struct graph* ir_graph, struct vector* allowe
     }
 }
 
-
 void generate_sub_graph_io(struct graph* ir_graph)
 {
     int sub_graph_count = get_vector_num(ir_graph->subgraph_list);
@@ -541,8 +533,6 @@ void generate_sub_graph_io(struct graph* ir_graph)
     }
 }
 
-
-
 void add_sub_graph_to_ir_graph(struct graph* ir_graph)
 {
     const int sub_graphs_count = get_vector_num(ir_graph->subgraph_list);
@@ -750,7 +740,6 @@ void add_sub_graph_to_ir_graph(struct graph* ir_graph)
     }
 }
 
-
 void dump_sub_graph(struct subgraph* sub_graph)
 {
     TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num);
diff --git a/source/optimizer/split.h b/source/optimizer/split.h
index ace1f27f0..1e65a733d 100644
--- a/source/optimizer/split.h
+++ b/source/optimizer/split.h
@@ -28,7 +28,6 @@ struct graph;
 struct subgraph;
 struct vector;
 
-
 int check_sub_info(struct graph* ir_graph);
 
 struct vector* get_graph_blocked_nodes(const struct graph* ir_graph, struct vector* blocked_ops, struct vector* allowed_precision);
diff --git a/source/scheduler/scheduler.c b/source/scheduler/scheduler.c
index fdcb60a56..d352be39e 100644
--- a/source/scheduler/scheduler.c
+++ b/source/scheduler/scheduler.c
@@ -36,7 +36,6 @@
 
 #include <string.h>
 
-
 static int sched_prerun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph)
 {
     int subgraph_num = get_vector_num(ir_graph->subgraph_list);
@@ -71,7 +70,6 @@ static int sched_prerun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph)
     return 0;
 }
 
-
 static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block)
 {
     if (block == 0)
@@ -114,7 +112,7 @@ static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block)
 
         for (int i = 0; i < wait_num; i++)
         {
-            struct subgraph* subgraph = *( struct subgraph** )get_vector_data(wait_list, i);
+            struct subgraph* subgraph = *(struct subgraph**)get_vector_data(wait_list, i);
 
             if (subgraph->input_ready_count == subgraph->input_wait_count)
                 ready_list[ready_num++] = i;
@@ -128,7 +126,7 @@ static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block)
 
         for (int i = 0; i < ready_num; i++)
         {
-            struct subgraph* subgraph = *( struct subgraph** )get_vector_data(wait_list, ready_list[i]);
+            struct subgraph* subgraph = *(struct subgraph**)get_vector_data(wait_list, ready_list[i]);
             ir_device_t* nn_dev = subgraph->device;
 
             subgraph->status = GRAPH_STAT_RUNNING;
@@ -183,13 +181,11 @@ static int sched_run(ir_scheduler_t* scheduler, ir_graph_t* ir_graph, int block)
     return 0;
 }
 
-
 static int sched_wait(ir_scheduler_t* scheduler, ir_graph_t* ir_graph)
 {
     return -1;
 }
 
-
 static int sched_postrun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph)
 {
     int subgraph_num = get_vector_num(ir_graph->subgraph_list);
@@ -216,17 +212,15 @@ static int sched_postrun(ir_scheduler_t* scheduler, ir_graph_t* ir_graph)
         return 0;
 }
 
-
 static ir_scheduler_t sync_scheduler = {
-        .name   = "sync",
-        .prerun = sched_prerun,
-        .run = sched_run,
-        .wait = sched_wait,
-        .postrun = sched_postrun,
-        .release = NULL,
+    .name = "sync",
+    .prerun = sched_prerun,
+    .run = sched_run,
+    .wait = sched_wait,
+    .postrun = sched_postrun,
+    .release = NULL,
 };
 
-
 ir_scheduler_t* find_default_scheduler(void)
 {
     return &sync_scheduler;
diff --git a/source/scheduler/scheduler.h b/source/scheduler/scheduler.h
index cca04c830..3ab8f619e 100644
--- a/source/scheduler/scheduler.h
+++ b/source/scheduler/scheduler.h
@@ -28,7 +28,6 @@
 struct graph;
 struct vector;
 
-
 /*!
  * @struct ir_scheduler_t
  * @brief  Abstract scheduler intermediate representation
@@ -37,14 +36,13 @@ typedef struct scheduler
 {
     const char* name;
 
-    int  (*prerun)(struct scheduler*, struct graph*);
-    int  (*run)(struct scheduler*, struct graph*, int block);
-    int  (*wait)(struct scheduler*, struct graph*);
-    int  (*postrun)(struct scheduler*, struct graph*);
+    int (*prerun)(struct scheduler*, struct graph*);
+    int (*run)(struct scheduler*, struct graph*, int block);
+    int (*wait)(struct scheduler*, struct graph*);
+    int (*postrun)(struct scheduler*, struct graph*);
     void (*release)(struct scheduler*);
 } ir_scheduler_t;
 
-
 /*!
  * @brief  Dump the node.
  *
diff --git a/source/serializer/serializer.c b/source/serializer/serializer.c
index 47c13affe..a6e90b13e 100644
--- a/source/serializer/serializer.c
+++ b/source/serializer/serializer.c
@@ -27,7 +27,6 @@
 
 #include <string.h>
 
-
 void init_serializer(struct serializer* serializer)
 {
     memset(serializer, 0, sizeof(serializer_t));
diff --git a/source/serializer/serializer.h b/source/serializer/serializer.h
index 21c1b1b30..592a803ad 100644
--- a/source/serializer/serializer.h
+++ b/source/serializer/serializer.h
@@ -29,7 +29,6 @@
 
 struct graph;
 
-
 /*!
  * @struct serializer_t
  * @brief  Abstract serializer
@@ -60,7 +59,6 @@ typedef struct serializer
     int (*release)(struct serializer*);
 } serializer_t;
 
-
 /*!
  * @brief Initialize serializer
  *
diff --git a/source/serializer/tmfile/op/tm2_add_n.c b/source/serializer/tmfile/op/tm2_add_n.c
index ba606cc66..441d38ed4 100644
--- a/source/serializer/tmfile/op/tm2_add_n.c
+++ b/source/serializer/tmfile/op/tm2_add_n.c
@@ -31,19 +31,16 @@
 #include "serializer/tmfile/tm2_serializer.h"
 #include "utility/log.h"
 
-
 static int add_n_op_map(int op)
 {
     return OP_ADD_N;
 }
 
-
 static int tm2_load_add_n(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_add_n_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_argmax.c b/source/serializer/tmfile/op/tm2_argmax.c
index 22c6603b3..3bfeb8665 100644
--- a/source/serializer/tmfile/op/tm2_argmax.c
+++ b/source/serializer/tmfile/op/tm2_argmax.c
@@ -34,19 +34,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int argmax_op_map(int op)
 {
     return OP_ARGMAX;
 }
 
-
 static int tm2_load_argmax(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     struct argmax_param* argmax_param = (struct argmax_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ArgMaxParam* tm_param = ( TM2_ArgMaxParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ArgMaxParam* tm_param = (TM2_ArgMaxParam*)(mem_base + tm_op->offset_t_param);
 
     argmax_param->axis = tm_param->axis;
     argmax_param->keepdims = tm_param->keepdims;
@@ -54,7 +52,6 @@ static int tm2_load_argmax(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_argmax_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_argmax_op()
     return 0;
 }
 
-
 int unregister_tm2_argmax_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_argmin.c b/source/serializer/tmfile/op/tm2_argmin.c
index ab34d8920..782bd9dd3 100644
--- a/source/serializer/tmfile/op/tm2_argmin.c
+++ b/source/serializer/tmfile/op/tm2_argmin.c
@@ -34,19 +34,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int argmin_op_map(int op)
 {
     return OP_ARGMIN;
 }
 
-
 static int tm2_load_argmin(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct argmin_param* argmin_param = ( struct argmin_param* )ir_node->op.param_mem;
+    struct argmin_param* argmin_param = (struct argmin_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ArgMaxParam* tm_param = ( TM2_ArgMaxParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ArgMaxParam* tm_param = (TM2_ArgMaxParam*)(mem_base + tm_op->offset_t_param);
 
     argmin_param->axis = tm_param->axis;
     argmin_param->keepdims = tm_param->keepdims;
@@ -54,7 +52,6 @@ static int tm2_load_argmin(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_argmin_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_argmin_op()
     return 0;
 }
 
-
 int unregister_tm2_argmin_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_batchnorm.c b/source/serializer/tmfile/op/tm2_batchnorm.c
index 17acd0467..a56447aec 100644
--- a/source/serializer/tmfile/op/tm2_batchnorm.c
+++ b/source/serializer/tmfile/op/tm2_batchnorm.c
@@ -34,19 +34,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int batchnorm_op_map(int op)
 {
     return OP_BATCHNORM;
 }
 
-
 static int tm2_load_batchnorm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )ir_node->op.param_mem;
+    struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_BatchNormParam* tm_param = ( TM2_BatchNormParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_BatchNormParam* tm_param = (TM2_BatchNormParam*)(mem_base + tm_op->offset_t_param);
 
     batchnorm_param->rescale_factor = tm_param->rescale_factor;
     batchnorm_param->eps = tm_param->eps;
@@ -55,7 +53,6 @@ static int tm2_load_batchnorm(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_batchnorm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_batchnorm_op()
     return 0;
 }
 
-
 int unregister_tm2_batchnorm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_batchtospacend.c b/source/serializer/tmfile/op/tm2_batchtospacend.c
index 2f4077820..dd68d30e3 100644
--- a/source/serializer/tmfile/op/tm2_batchtospacend.c
+++ b/source/serializer/tmfile/op/tm2_batchtospacend.c
@@ -34,19 +34,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int batchtospacend_op_map(int op)
 {
     return OP_BATCHTOSPACEND;
 }
 
-
 static int tm2_load_batchtospacend(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct batchtospacend_param* batchtospacend_param = ( struct batchtospacend_param* )ir_node->op.param_mem;
+    struct batchtospacend_param* batchtospacend_param = (struct batchtospacend_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_BatchToSpaceNDParam* tm_param = ( TM2_BatchToSpaceNDParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_BatchToSpaceNDParam* tm_param = (TM2_BatchToSpaceNDParam*)(mem_base + tm_op->offset_t_param);
 
     batchtospacend_param->dilation_x = tm_param->dilation_x;
     batchtospacend_param->dilation_y = tm_param->dilation_y;
@@ -58,7 +56,6 @@ static int tm2_load_batchtospacend(struct graph* ir_graph, struct node* ir_node,
     return 0;
 }
 
-
 int register_tm2_batchtospacend_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -74,7 +71,6 @@ int register_tm2_batchtospacend_op()
     return 0;
 }
 
-
 int unregister_tm2_batchtospacend_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_bias.c b/source/serializer/tmfile/op/tm2_bias.c
index 64294cc91..6764fa10b 100644
--- a/source/serializer/tmfile/op/tm2_bias.c
+++ b/source/serializer/tmfile/op/tm2_bias.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int bias_op_map(int op)
 {
     return OP_BIAS;
 }
 
-
 static int tm2_load_bias(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_bias_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_bias_op()
     return 0;
 }
 
-
 int unregister_tm2_bias_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_broadmul.c b/source/serializer/tmfile/op/tm2_broadmul.c
index 6172a2cb4..33df417bc 100644
--- a/source/serializer/tmfile/op/tm2_broadmul.c
+++ b/source/serializer/tmfile/op/tm2_broadmul.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int broadmul_op_map(int op)
 {
     return OP_BROADMUL;
 }
 
-
 static int tm2_load_broadmul(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                              const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_broadmul_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_broadmul_op()
     return 0;
 }
 
-
 int unregister_tm2_broadmul_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_cast.c b/source/serializer/tmfile/op/tm2_cast.c
index 3a256fbf7..19c4fbe80 100644
--- a/source/serializer/tmfile/op/tm2_cast.c
+++ b/source/serializer/tmfile/op/tm2_cast.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int op_map(int op)
 {
     return OP_CAST;
 }
 
-
 static int tm2_load_cast(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct cast_param* param = ( struct cast_param* )ir_node->op.param_mem;
+    struct cast_param* param = (struct cast_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_CastParam* tm_param = ( TM2_CastParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_CastParam* tm_param = (TM2_CastParam*)(mem_base + tm_op->offset_t_param);
 
     param->type_from = tm_param->type_from;
     param->type_to = tm_param->type_to;
@@ -57,7 +55,6 @@ static int tm2_load_cast(struct graph* ir_graph, struct node* ir_node, const TM2
     return 0;
 }
 
-
 int register_tm2_cast_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -73,7 +70,6 @@ int register_tm2_cast_op()
     return 0;
 }
 
-
 int unregister_tm2_cast_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_ceil.c b/source/serializer/tmfile/op/tm2_ceil.c
index f88e790b5..0d5abe606 100644
--- a/source/serializer/tmfile/op/tm2_ceil.c
+++ b/source/serializer/tmfile/op/tm2_ceil.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int ceil_op_map(int op)
 {
     return OP_CEIL;
 }
 
-
 static int tm2_load_ceil(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_ceil_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_ceil_op()
     return 0;
 }
 
-
 int unregister_tm2_ceil_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_clip.c b/source/serializer/tmfile/op/tm2_clip.c
index 80a54e072..1e0478ff1 100644
--- a/source/serializer/tmfile/op/tm2_clip.c
+++ b/source/serializer/tmfile/op/tm2_clip.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int clip_op_map(int op)
 {
     return OP_CLIP;
 }
 
-
 static int tm2_load_clip(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct clip_param* clip_param = ( struct clip_param* )ir_node->op.param_mem;
+    struct clip_param* clip_param = (struct clip_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ClipParam* tm_param = ( TM2_ClipParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ClipParam* tm_param = (TM2_ClipParam*)(mem_base + tm_op->offset_t_param);
 
     clip_param->max = tm_param->max;
     clip_param->min = tm_param->min;
@@ -55,7 +53,6 @@ static int tm2_load_clip(struct graph* ir_graph, struct node* ir_node, const TM2
     return 0;
 }
 
-
 int register_tm2_clip_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_clip_op()
     return 0;
 }
 
-
 int unregister_tm2_clip_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_comparison.c b/source/serializer/tmfile/op/tm2_comparison.c
index 3e71ed91a..05220ab5c 100644
--- a/source/serializer/tmfile/op/tm2_comparison.c
+++ b/source/serializer/tmfile/op/tm2_comparison.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int comparison_op_map(int op)
 {
     return OP_COMPARISON;
 }
 
-
 static int tm2_load_comparison(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                const TM2_Operator* tm_op)
 {
-    struct comparison_param* param = ( struct comparison_param* )ir_node->op.param_mem;
+    struct comparison_param* param = (struct comparison_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ComparisonParam* tm_param = ( TM2_ComparisonParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ComparisonParam* tm_param = (TM2_ComparisonParam*)(mem_base + tm_op->offset_t_param);
 
     param->type = tm_param->type;
 
     return 0;
 }
 
-
 int register_tm2_comparison_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_comparison_op()
     return 0;
 }
 
-
 int unregister_tm2_comparison_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_concat.c b/source/serializer/tmfile/op/tm2_concat.c
index 4e7cc6324..44949f093 100644
--- a/source/serializer/tmfile/op/tm2_concat.c
+++ b/source/serializer/tmfile/op/tm2_concat.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int concat_op_map(int op)
 {
     return OP_CONCAT;
 }
 
-
 static int tm2_load_concat(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
-    struct concat_param* concat_param = ( struct concat_param* )ir_node->op.param_mem;
+    struct concat_param* concat_param = (struct concat_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ConcatParam* tm_param = ( TM2_ConcatParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ConcatParam* tm_param = (TM2_ConcatParam*)(mem_base + tm_op->offset_t_param);
 
     concat_param->axis = tm_param->axis;
 
     return 0;
 }
 
-
 int register_tm2_concat_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_concat_op()
     return 0;
 }
 
-
 int unregister_tm2_concat_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_conv.c b/source/serializer/tmfile/op/tm2_conv.c
index e8aa3144c..8397206ba 100644
--- a/source/serializer/tmfile/op/tm2_conv.c
+++ b/source/serializer/tmfile/op/tm2_conv.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int conv_op_map(int op)
 {
     return OP_CONV;
 }
 
-
 static int tm2_load_conv(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ConvParam* tm_param = ( TM2_ConvParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ConvParam* tm_param = (TM2_ConvParam*)(mem_base + tm_op->offset_t_param);
 
     conv_param->kernel_h = tm_param->kernel_h;
     conv_param->kernel_w = tm_param->kernel_w;
@@ -85,7 +83,6 @@ static int tm2_load_conv(struct graph* ir_graph, struct node* ir_node, const TM2
     return 0;
 }
 
-
 int register_tm2_conv_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -101,7 +98,6 @@ int register_tm2_conv_op()
     return 0;
 }
 
-
 int unregister_tm2_conv_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_crop.c b/source/serializer/tmfile/op/tm2_crop.c
index ce4b21212..1098bd48d 100644
--- a/source/serializer/tmfile/op/tm2_crop.c
+++ b/source/serializer/tmfile/op/tm2_crop.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int crop_op_map(int op)
 {
     return OP_CROP;
 }
 
-
 static int tm2_load_crop(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct crop_param* crop_param = ( struct crop_param* )ir_node->op.param_mem;
+    struct crop_param* crop_param = (struct crop_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_CropParam* tm_param = ( TM2_CropParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_CropParam* tm_param = (TM2_CropParam*)(mem_base + tm_op->offset_t_param);
 
     crop_param->num_args = tm_param->num_args;
     crop_param->offset_c = tm_param->offset_c;
@@ -62,7 +60,6 @@ static int tm2_load_crop(struct graph* ir_graph, struct node* ir_node, const TM2
     return 0;
 }
 
-
 int register_tm2_crop_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -78,7 +75,6 @@ int register_tm2_crop_op()
     return 0;
 }
 
-
 int unregister_tm2_crop_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_deconv.c b/source/serializer/tmfile/op/tm2_deconv.c
index 153216caf..286cad77e 100644
--- a/source/serializer/tmfile/op/tm2_deconv.c
+++ b/source/serializer/tmfile/op/tm2_deconv.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int deconv_op_map(int op)
 {
     return OP_DECONV;
 }
 
-
 static int tm2_load_deconv(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
-    struct deconv_param* deconv_param = ( struct deconv_param* )ir_node->op.param_mem;
+    struct deconv_param* deconv_param = (struct deconv_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_DeconvParam* tm_param = ( TM2_DeconvParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_DeconvParam* tm_param = (TM2_DeconvParam*)(mem_base + tm_op->offset_t_param);
 
     deconv_param->kernel_h = tm_param->kernel_h;
     deconv_param->kernel_w = tm_param->kernel_w;
@@ -64,17 +62,16 @@ static int tm2_load_deconv(struct graph* ir_graph, struct node* ir_node, const T
     deconv_param->dilation_h = tm_param->dilation_h;
     deconv_param->dilation_w = tm_param->dilation_w;
 
-    deconv_param->group = tm_param->group ;
-    deconv_param->num_output = tm_param->num_output ;
-    deconv_param->activation = tm_param->activation ;
-    
+    deconv_param->group = tm_param->group;
+    deconv_param->num_output = tm_param->num_output;
+    deconv_param->activation = tm_param->activation;
+
     deconv_param->output_pad_h0 = tm_param->output_pad_h0;
     deconv_param->output_pad_w0 = tm_param->output_pad_w0;
 
     return 0;
 }
 
-
 int register_tm2_deconv_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -90,7 +87,6 @@ int register_tm2_deconv_op()
     return 0;
 }
 
-
 int unregister_tm2_deconv_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_depthtospace.c b/source/serializer/tmfile/op/tm2_depthtospace.c
index f66248d1a..d7282f275 100644
--- a/source/serializer/tmfile/op/tm2_depthtospace.c
+++ b/source/serializer/tmfile/op/tm2_depthtospace.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int depthtospace_op_map(int op)
 {
     return OP_DEPTHTOSPACE;
 }
 
-
 static int tm2_load_depthtospace(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                  const TM2_Operator* tm_op)
 {
-    struct depthtospace_param* depthtospace_param = ( struct depthtospace_param* )ir_node->op.param_mem;
+    struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_DepthToSpaceParam* tm_param = ( TM2_DepthToSpaceParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_DepthToSpaceParam* tm_param = (TM2_DepthToSpaceParam*)(mem_base + tm_op->offset_t_param);
 
     depthtospace_param->block_size = tm_param->block_size;
 
     return 0;
 }
 
-
 int register_tm2_depthtospace_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_depthtospace_op()
     return 0;
 }
 
-
 int unregister_tm2_depthtospace_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_detection_output.c b/source/serializer/tmfile/op/tm2_detection_output.c
index 152c8a1a6..50e1edd4c 100644
--- a/source/serializer/tmfile/op/tm2_detection_output.c
+++ b/source/serializer/tmfile/op/tm2_detection_output.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int detection_op_map(int op)
 {
     return OP_DETECTION_OUTPUT;
 }
 
-
 static int tm2_load_detection(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                               const TM2_Operator* tm_op)
 {
-    struct detection_output_param* detection_output_param = ( struct detection_output_param* )ir_node->op.param_mem;
+    struct detection_output_param* detection_output_param = (struct detection_output_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_DetectionOutputParam* tm_param = ( TM2_DetectionOutputParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_DetectionOutputParam* tm_param = (TM2_DetectionOutputParam*)(mem_base + tm_op->offset_t_param);
 
     detection_output_param->num_classes = tm_param->num_classes;
     detection_output_param->keep_top_k = tm_param->keep_top_k;
@@ -58,7 +56,6 @@ static int tm2_load_detection(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_detection_output_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -74,7 +71,6 @@ int register_tm2_detection_output_op()
     return 0;
 }
 
-
 int unregister_tm2_detection_output_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_detection_postprocess.c b/source/serializer/tmfile/op/tm2_detection_postprocess.c
index 371554288..0a06c2698 100644
--- a/source/serializer/tmfile/op/tm2_detection_postprocess.c
+++ b/source/serializer/tmfile/op/tm2_detection_postprocess.c
@@ -35,22 +35,18 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int detection_postprocess_op_map(int op)
 {
     return OP_DETECTION_POSTPROCESS;
 }
 
-
 static int tm2_load_detection_postprocess(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                           const TM2_Operator* tm_op)
 {
-    struct detection_postprocess_param* detection_postprocess_param =
-        ( struct detection_postprocess_param* )ir_node->op.param_mem;
+    struct detection_postprocess_param* detection_postprocess_param = (struct detection_postprocess_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_DetectionPostProcessParam* tm_param =
-        ( TM2_DetectionPostProcessParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_DetectionPostProcessParam* tm_param = (TM2_DetectionPostProcessParam*)(mem_base + tm_op->offset_t_param);
 
     detection_postprocess_param->max_detections = tm_param->max_detections;
     detection_postprocess_param->max_classes_per_detection = tm_param->max_classes_per_detection;
@@ -62,13 +58,12 @@ static int tm2_load_detection_postprocess(struct graph* ir_graph, struct node* i
     detection_postprocess_param->scales = (float*)sys_malloc(vf_scales->v_num * sizeof(float));
 
     for (unsigned int i = 0; i < vf_scales->v_num;
-         i++)    // TODO : need to check v_num .Next called in run function(detection_postprocess) default as 4 ?
+         i++) // TODO : need to check v_num .Next called in run function(detection_postprocess) default as 4 ?
         detection_postprocess_param->scales[i] = vf_scales->data[i];
 
     return 0;
 }
 
-
 int register_tm2_detection_postprocess_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -85,7 +80,6 @@ int register_tm2_detection_postprocess_op()
     return 0;
 }
 
-
 int unregister_tm2_detection_postprocess_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_dropout.c b/source/serializer/tmfile/op/tm2_dropout.c
index 1efea5674..3faeb9907 100644
--- a/source/serializer/tmfile/op/tm2_dropout.c
+++ b/source/serializer/tmfile/op/tm2_dropout.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int dropout_op_map(int op)
 {
     return OP_DROPOUT;
 }
 
-
 static int tm2_load_dropout(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_dropout_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_dropout_op()
     return 0;
 }
 
-
 int unregister_tm2_dropout_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_eltwise.c b/source/serializer/tmfile/op/tm2_eltwise.c
index 7c2b168d7..274e711dd 100644
--- a/source/serializer/tmfile/op/tm2_eltwise.c
+++ b/source/serializer/tmfile/op/tm2_eltwise.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int eltwise_op_map(int op)
 {
     return OP_ELTWISE;
 }
 
-
 static int tm2_load_eltwise(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct eltwise_param* eltwise_param = ( struct eltwise_param* )ir_node->op.param_mem;
+    struct eltwise_param* eltwise_param = (struct eltwise_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_EltwiseParam* tm_param = ( TM2_EltwiseParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_EltwiseParam* tm_param = (TM2_EltwiseParam*)(mem_base + tm_op->offset_t_param);
 
     eltwise_param->type = tm_param->type;
     eltwise_param->caffe_flavor = tm_param->caffe_flavor;
@@ -58,7 +56,6 @@ static int tm2_load_eltwise(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_eltwise_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -74,7 +71,6 @@ int register_tm2_eltwise_op()
     return 0;
 }
 
-
 int unregister_tm2_eltwise_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_elu.c b/source/serializer/tmfile/op/tm2_elu.c
index 1cb72d337..5a4147542 100644
--- a/source/serializer/tmfile/op/tm2_elu.c
+++ b/source/serializer/tmfile/op/tm2_elu.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int elu_op_map(int op)
 {
     return OP_ELU;
 }
 
-
 static int tm2_load_elu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                         const TM2_Operator* tm_op)
 {
-    struct elu_param* param = ( struct elu_param* )ir_node->op.param_mem;
+    struct elu_param* param = (struct elu_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_EluParam* tm_param = ( TM2_EluParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_EluParam* tm_param = (TM2_EluParam*)(mem_base + tm_op->offset_t_param);
 
     param->alpha = tm_param->alpha;
 
     return 0;
 }
 
-
 int register_tm2_elu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_elu_op()
     return 0;
 }
 
-
 int unregister_tm2_elu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_embedding.c b/source/serializer/tmfile/op/tm2_embedding.c
index f91165d53..0cb838dab 100644
--- a/source/serializer/tmfile/op/tm2_embedding.c
+++ b/source/serializer/tmfile/op/tm2_embedding.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int gather_op_map(int op)
 {
     return OP_EMBEDDING;
 }
 
-
 static int tm2_load_embedding(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                               const TM2_Operator* tm_op)
 {
-    struct embedding_param* gather_param = ( struct embedding_param* )ir_node->op.param_mem;
+    struct embedding_param* gather_param = (struct embedding_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_EmbedParam* tm_param = ( TM2_EmbedParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_EmbedParam* tm_param = (TM2_EmbedParam*)(mem_base + tm_op->offset_t_param);
 
     // gather_param->bias_term = tm_param->bias_term;
     gather_param->input_dim = tm_param->input_dim;
@@ -57,7 +55,6 @@ static int tm2_load_embedding(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_embedding_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -73,7 +70,6 @@ int register_tm2_embedding_op()
     return 0;
 }
 
-
 int unregister_tm2_embedding_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_expand.c b/source/serializer/tmfile/op/tm2_expand.c
index 441d0671e..11090dc18 100644
--- a/source/serializer/tmfile/op/tm2_expand.c
+++ b/source/serializer/tmfile/op/tm2_expand.c
@@ -35,29 +35,26 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int expand_op_map(int op)
 {
     return OP_EXPAND;
 }
 
-
 static int tm2_load_expand(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                            const TM2_Operator* tm_op)
+                           const TM2_Operator* tm_op)
 {
-    struct expand_param* param = ( struct expand_param* )ir_node->op.param_mem;
+    struct expand_param* param = (struct expand_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ExpandParam* tm_param = ( TM2_ExpandParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ExpandParam* tm_param = (TM2_ExpandParam*)(mem_base + tm_op->offset_t_param);
     if (tm_param->offset_ex_shape != TM2_NOT_SET)
     {
-        const TM2_Vector_dims* v_ex_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_ex_shape);
-        param->ex_shape = ( int* )sys_malloc(v_ex_shape->v_num * sizeof(int));
+        const TM2_Vector_dims* v_ex_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_ex_shape);
+        param->ex_shape = (int*)sys_malloc(v_ex_shape->v_num * sizeof(int));
 
         for (unsigned int i = 0; i < v_ex_shape->v_num; i++)
         {
             param->ex_shape[i] = v_ex_shape->dims[i];
-            
         }
     }
     param->dim_num = tm_param->dim_num;
@@ -65,7 +62,6 @@ static int tm2_load_expand(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_expand_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -81,7 +77,6 @@ int register_tm2_expand_op()
     return 0;
 }
 
-
 int unregister_tm2_expand_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_expanddims.c b/source/serializer/tmfile/op/tm2_expanddims.c
index d0a8a7a6f..6826d62a4 100644
--- a/source/serializer/tmfile/op/tm2_expanddims.c
+++ b/source/serializer/tmfile/op/tm2_expanddims.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int expanddims_op_map(int op)
 {
     return OP_EXPANDDIMS;
 }
 
-
 static int tm2_load_expanddims(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                const TM2_Operator* tm_op)
 {
-    struct expanddims_param* expanddims_param = ( struct expanddims_param* )ir_node->op.param_mem;
+    struct expanddims_param* expanddims_param = (struct expanddims_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ExpanddimsParam* tm_param = ( TM2_ExpanddimsParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ExpanddimsParam* tm_param = (TM2_ExpanddimsParam*)(mem_base + tm_op->offset_t_param);
 
     expanddims_param->axis = tm_param->axis;
 
     return 0;
 }
 
-
 int register_tm2_expanddims_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_expanddims_op()
     return 0;
 }
 
-
 int unregister_tm2_expanddims_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_fc.c b/source/serializer/tmfile/op/tm2_fc.c
index f61d49277..6a8920bf9 100644
--- a/source/serializer/tmfile/op/tm2_fc.c
+++ b/source/serializer/tmfile/op/tm2_fc.c
@@ -34,29 +34,26 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int fc_op_map(int op)
 {
     return OP_FC;
 }
 
-
 static int tm2_load_fc(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                        const TM2_Operator* tm_op)
 {
     /* todo: using new TM2 model definition*/
     /* TODO: get input_channel from tm_param */
-    struct fc_param* fc_param = ( struct fc_param* )ir_node->op.param_mem;
+    struct fc_param* fc_param = (struct fc_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_FCParam* tm_param = ( TM2_FCParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_FCParam* tm_param = (TM2_FCParam*)(mem_base + tm_op->offset_t_param);
 
     fc_param->num_output = tm_param->num_output;
 
     return 0;
 }
 
-
 int register_tm2_fc_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_fc_op()
     return 0;
 }
 
-
 int unregister_tm2_fc_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_flatten.c b/source/serializer/tmfile/op/tm2_flatten.c
index 7401139ca..9a95a0c40 100644
--- a/source/serializer/tmfile/op/tm2_flatten.c
+++ b/source/serializer/tmfile/op/tm2_flatten.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int flatten_op_map(int op)
 {
     return OP_FLATTEN;
 }
 
-
 static int tm2_load_flatten(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct flatten_param* flatten_param = ( struct flatten_param* )ir_node->op.param_mem;
+    struct flatten_param* flatten_param = (struct flatten_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_FlattenParam* tm_param = ( TM2_FlattenParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_FlattenParam* tm_param = (TM2_FlattenParam*)(mem_base + tm_op->offset_t_param);
 
     flatten_param->end_axis = tm_param->end_axis;
     flatten_param->axis = tm_param->axis;
@@ -55,7 +53,6 @@ static int tm2_load_flatten(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_flatten_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_flatten_op()
     return 0;
 }
 
-
 int unregister_tm2_flatten_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_gather.c b/source/serializer/tmfile/op/tm2_gather.c
index 092fabf0d..6f63b532c 100644
--- a/source/serializer/tmfile/op/tm2_gather.c
+++ b/source/serializer/tmfile/op/tm2_gather.c
@@ -34,24 +34,22 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int gather_op_map(int op)
 {
     return OP_GATHER;
 }
 
-
 static int tm2_load_gather(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
-    struct gather_param* gather_param = ( struct gather_param* )ir_node->op.param_mem;
+    struct gather_param* gather_param = (struct gather_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_GatherParam* tm_param = ( TM2_GatherParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_GatherParam* tm_param = (TM2_GatherParam*)(mem_base + tm_op->offset_t_param);
 
-	gather_param->axis = tm_param->axis;
-	gather_param->indices_num = tm_param->indices_num ;
-	if(tm_param->is_onnx)
+    gather_param->axis = tm_param->axis;
+    gather_param->indices_num = tm_param->indices_num;
+    if (tm_param->is_onnx)
         gather_param->is_onnx = true;
     else
         gather_param->is_onnx = false;
@@ -59,7 +57,6 @@ static int tm2_load_gather(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_gather_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -75,7 +72,6 @@ int register_tm2_gather_op()
     return 0;
 }
 
-
 int unregister_tm2_gather_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_gemm.c b/source/serializer/tmfile/op/tm2_gemm.c
index 574f6cf02..491e4ccfa 100644
--- a/source/serializer/tmfile/op/tm2_gemm.c
+++ b/source/serializer/tmfile/op/tm2_gemm.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int gemm_op_map(int op)
 {
     return OP_GEMM;
 }
 
-
 static int tm2_load_gemm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct gemm_param* gemm_param = ( struct gemm_param* )ir_node->op.param_mem;
+    struct gemm_param* gemm_param = (struct gemm_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_GemmParam* tm_param = ( TM2_GemmParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_GemmParam* tm_param = (TM2_GemmParam*)(mem_base + tm_op->offset_t_param);
 
     gemm_param->alpha = tm_param->alpha;
     gemm_param->beta = tm_param->beta;
@@ -57,7 +55,6 @@ static int tm2_load_gemm(struct graph* ir_graph, struct node* ir_node, const TM2
     return 0;
 }
 
-
 int register_tm2_gemm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -73,7 +70,6 @@ int register_tm2_gemm_op()
     return 0;
 }
 
-
 int unregister_tm2_gemm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_generic.c b/source/serializer/tmfile/op/tm2_generic.c
index 5cdf3ebe3..0258d2e2c 100644
--- a/source/serializer/tmfile/op/tm2_generic.c
+++ b/source/serializer/tmfile/op/tm2_generic.c
@@ -34,29 +34,26 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int generic_op_map(int op)
 {
     return OP_GENERIC;
 }
 
-
 static int tm2_load_generic(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct generic_param* generic_param = ( struct generic_param* )ir_node->op.param_mem;
+    struct generic_param* generic_param = (struct generic_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_GenericParam* tm_param = ( TM2_GenericParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_GenericParam* tm_param = (TM2_GenericParam*)(mem_base + tm_op->offset_t_param);
 
     generic_param->max_input_num = tm_param->max_input_num;
     generic_param->max_output_num = tm_param->max_output_num;
-    generic_param->op_name = ( char* )&tm_param->offset_s_opname;    // TODO: Need to check .
+    generic_param->op_name = (char*)&tm_param->offset_s_opname; // TODO: Need to check .
 
     return 0;
 }
 
-
 int register_tm2_generic_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_generic_op()
     return 0;
 }
 
-
 int unregister_tm2_generic_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_gru.c b/source/serializer/tmfile/op/tm2_gru.c
index f34cc0533..09db3c174 100644
--- a/source/serializer/tmfile/op/tm2_gru.c
+++ b/source/serializer/tmfile/op/tm2_gru.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int gru_op_map(int op)
 {
     return OP_GRU;
 }
 
-
 static int tm2_load_gru(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                         const TM2_Operator* tm_op)
 {
-    struct gru_param* gru_param = ( struct gru_param* )ir_node->op.param_mem;
+    struct gru_param* gru_param = (struct gru_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_GRUParam* tm_param = ( TM2_GRUParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_GRUParam* tm_param = (TM2_GRUParam*)(mem_base + tm_op->offset_t_param);
 
     gru_param->clip = tm_param->clip;
     gru_param->output_len = tm_param->output_len;
@@ -63,7 +61,6 @@ static int tm2_load_gru(struct graph* ir_graph, struct node* ir_node, const TM2_
     return 0;
 }
 
-
 int register_tm2_gru_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -79,7 +76,6 @@ int register_tm2_gru_op()
     return 0;
 }
 
-
 int unregister_tm2_gru_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_hardsigmoid.c b/source/serializer/tmfile/op/tm2_hardsigmoid.c
index 0dcb40dcc..c7e681845 100644
--- a/source/serializer/tmfile/op/tm2_hardsigmoid.c
+++ b/source/serializer/tmfile/op/tm2_hardsigmoid.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int gather_op_map(int op)
 {
     return OP_HARDSIGMOID;
 }
 
-
 static int tm2_load_hard_sigmoid(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                  const TM2_Operator* tm_op)
 {
-    struct hard_sigmoid_param* gather_param = ( struct hard_sigmoid_param* )ir_node->op.param_mem;
+    struct hard_sigmoid_param* gather_param = (struct hard_sigmoid_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_HardsigmoidParam* tm_param = ( TM2_HardsigmoidParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_HardsigmoidParam* tm_param = (TM2_HardsigmoidParam*)(mem_base + tm_op->offset_t_param);
 
     gather_param->alpha = tm_param->alpha;
     gather_param->beta = tm_param->beta;
@@ -55,7 +53,6 @@ static int tm2_load_hard_sigmoid(struct graph* ir_graph, struct node* ir_node, c
     return 0;
 }
 
-
 int register_tm2_hardsigmoid_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_hardsigmoid_op()
     return 0;
 }
 
-
 int unregister_tm2_hardsigmoid_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_hardswish.c b/source/serializer/tmfile/op/tm2_hardswish.c
index 03ebf3260..4f42636be 100644
--- a/source/serializer/tmfile/op/tm2_hardswish.c
+++ b/source/serializer/tmfile/op/tm2_hardswish.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int gather_op_map(int op)
 {
     return OP_HARDSWISH;
 }
 
-
 static int tm2_load_hardswish(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                               const TM2_Operator* tm_op)
 {
-    struct hardswish_param* gather_param = ( struct hardswish_param* )ir_node->op.param_mem;
+    struct hardswish_param* gather_param = (struct hardswish_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_HardSwishParam* tm_param = ( TM2_HardSwishParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_HardSwishParam* tm_param = (TM2_HardSwishParam*)(mem_base + tm_op->offset_t_param);
 
     gather_param->alpha = tm_param->alpha;
     gather_param->beta = tm_param->beta;
@@ -55,7 +53,6 @@ static int tm2_load_hardswish(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_hardswish_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_hardswish_op()
     return 0;
 }
 
-
 int unregister_tm2_hardswish_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_instancenorm.c b/source/serializer/tmfile/op/tm2_instancenorm.c
index 6f9c17bf4..526987c59 100644
--- a/source/serializer/tmfile/op/tm2_instancenorm.c
+++ b/source/serializer/tmfile/op/tm2_instancenorm.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int instancenorm_op_map(int op)
 {
     return OP_INSTANCENORM;
 }
 
-
 static int tm2_load_instancenorm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                  const TM2_Operator* tm_op)
 {
-    struct instancenorm_Param* gather_param = ( struct instancenorm_Param* )ir_node->op.param_mem;
+    struct instancenorm_Param* gather_param = (struct instancenorm_Param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_InstanceNormParam* tm_param = ( TM2_InstanceNormParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_InstanceNormParam* tm_param = (TM2_InstanceNormParam*)(mem_base + tm_op->offset_t_param);
 
     gather_param->eps = tm_param->eps;
 
     return 0;
 }
 
-
 int register_tm2_instancenorm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_instancenorm_op()
     return 0;
 }
 
-
 int unregister_tm2_instancenorm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_interp.c b/source/serializer/tmfile/op/tm2_interp.c
index aea2786ee..992a0c069 100644
--- a/source/serializer/tmfile/op/tm2_interp.c
+++ b/source/serializer/tmfile/op/tm2_interp.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int interp_op_map(int op)
 {
     return OP_INTERP;
 }
 
-
 static int tm2_load_interp(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
-    struct interp_param* param = ( struct interp_param* )ir_node->op.param_mem;
+    struct interp_param* param = (struct interp_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_InterpParam* tm_param = ( TM2_InterpParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_InterpParam* tm_param = (TM2_InterpParam*)(mem_base + tm_op->offset_t_param);
 
     param->resize_type = tm_param->resize_type;
     param->width_scale = tm_param->width_scale;
@@ -58,7 +56,6 @@ static int tm2_load_interp(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_interp_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -74,7 +71,6 @@ int register_tm2_interp_op()
     return 0;
 }
 
-
 int unregister_tm2_interp_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_l2normalization.c b/source/serializer/tmfile/op/tm2_l2normalization.c
index 94c777c12..52727ffd4 100644
--- a/source/serializer/tmfile/op/tm2_l2normalization.c
+++ b/source/serializer/tmfile/op/tm2_l2normalization.c
@@ -32,19 +32,16 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int l2normalization_op_map(int op)
 {
     return OP_L2NORMALIZATION;
 }
 
-
 static int tm2_load_l2normalization(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_l2normalization_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -60,7 +57,6 @@ int register_tm2_l2normalization_op()
     return 0;
 }
 
-
 int unregister_tm2_l2normalization_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_l2pool.c b/source/serializer/tmfile/op/tm2_l2pool.c
index 1ff98e4e4..e569383f3 100644
--- a/source/serializer/tmfile/op/tm2_l2pool.c
+++ b/source/serializer/tmfile/op/tm2_l2pool.c
@@ -34,15 +34,13 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int l2pool_op_map(int op)
 {
     return OP_L2POOL;
 }
 
-
 static int tm2_load_l2pool(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                          const TM2_Operator* tm_op)
+                           const TM2_Operator* tm_op)
 {
     struct l2pool_param* l2pool_param = (struct l2pool_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
@@ -56,7 +54,6 @@ static int tm2_load_l2pool(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_l2pool_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_l2pool_op()
     return 0;
 }
 
-
 int unregister_tm2_l2pool_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_logical.c b/source/serializer/tmfile/op/tm2_logical.c
index d6fd3079f..ad2b77f59 100644
--- a/source/serializer/tmfile/op/tm2_logical.c
+++ b/source/serializer/tmfile/op/tm2_logical.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int logical_op_map(int op)
 {
     return OP_LOGICAL;
 }
 
-
 static int tm2_load_logical(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct logical_param* logical_param = ( struct logical_param* )ir_node->op.param_mem;
+    struct logical_param* logical_param = (struct logical_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_LogicalParam* tm_param = ( TM2_LogicalParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_LogicalParam* tm_param = (TM2_LogicalParam*)(mem_base + tm_op->offset_t_param);
 
     logical_param->type = tm_param->type;
 
     return 0;
 }
 
-
 int register_tm2_logical_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_logical_op()
     return 0;
 }
 
-
 int unregister_tm2_logical_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_logistic.c b/source/serializer/tmfile/op/tm2_logistic.c
index b1a585d20..b5a815db4 100644
--- a/source/serializer/tmfile/op/tm2_logistic.c
+++ b/source/serializer/tmfile/op/tm2_logistic.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int logistic_op_map(int op)
 {
     return OP_LOGISTIC;
 }
 
-
 static int tm2_load_logistic(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                              const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_logistic_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_logistic_op()
     return 0;
 }
 
-
 int unregister_tm2_logistic_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_logsoftmax.c b/source/serializer/tmfile/op/tm2_logsoftmax.c
index 0dcc23c2b..afc0d0303 100644
--- a/source/serializer/tmfile/op/tm2_logsoftmax.c
+++ b/source/serializer/tmfile/op/tm2_logsoftmax.c
@@ -34,13 +34,11 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int logsoftmax_op_map(int op)
 {
     return OP_LOGSOFTMAX;
 }
 
-
 static int tm2_load_logsoftmax(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     struct logsoftmax_param* logsoftmax_param = (struct logsoftmax_param*)ir_node->op.param_mem;
@@ -53,7 +51,6 @@ static int tm2_load_logsoftmax(struct graph* ir_graph, struct node* ir_node, con
     return 0;
 }
 
-
 int register_tm2_logsoftmax_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -69,7 +66,6 @@ int register_tm2_logsoftmax_op()
     return 0;
 }
 
-
 int unregister_tm2_logsoftmax_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_lrn.c b/source/serializer/tmfile/op/tm2_lrn.c
index 9a8a463a3..f5b536ac7 100644
--- a/source/serializer/tmfile/op/tm2_lrn.c
+++ b/source/serializer/tmfile/op/tm2_lrn.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int lrn_op_map(int op)
 {
     return OP_LRN;
 }
 
-
 static int tm2_load_lrn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                         const TM2_Operator* tm_op)
 {
-    struct lrn_param* lrn_param = ( struct lrn_param* )ir_node->op.param_mem;
+    struct lrn_param* lrn_param = (struct lrn_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_LRNParam* tm_param = ( TM2_LRNParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_LRNParam* tm_param = (TM2_LRNParam*)(mem_base + tm_op->offset_t_param);
 
     lrn_param->local_size = tm_param->local_size;
     lrn_param->alpha = tm_param->alpha;
@@ -58,7 +56,6 @@ static int tm2_load_lrn(struct graph* ir_graph, struct node* ir_node, const TM2_
     return 0;
 }
 
-
 int register_tm2_lrn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -74,7 +71,6 @@ int register_tm2_lrn_op()
     return 0;
 }
 
-
 int unregister_tm2_lrn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_lstm.c b/source/serializer/tmfile/op/tm2_lstm.c
index f8143802a..cdd982998 100644
--- a/source/serializer/tmfile/op/tm2_lstm.c
+++ b/source/serializer/tmfile/op/tm2_lstm.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int lstm_op_map(int op)
 {
     return OP_LSTM;
 }
 
-
 static int tm2_load_lstm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct lstm_param* lstm_param = ( struct lstm_param* )ir_node->op.param_mem;
+    struct lstm_param* lstm_param = (struct lstm_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_LstmParam* tm_param = ( TM2_LstmParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_LstmParam* tm_param = (TM2_LstmParam*)(mem_base + tm_op->offset_t_param);
 
     lstm_param->forget_bias = tm_param->forget_bias;
     lstm_param->clip = tm_param->clip;
@@ -71,7 +69,6 @@ static int tm2_load_lstm(struct graph* ir_graph, struct node* ir_node, const TM2
     return 0;
 }
 
-
 int register_tm2_lstm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -87,7 +84,6 @@ int register_tm2_lstm_op()
     return 0;
 }
 
-
 int unregister_tm2_lstm_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_matmul.c b/source/serializer/tmfile/op/tm2_matmul.c
index 3ccb1221d..11efabd7e 100644
--- a/source/serializer/tmfile/op/tm2_matmul.c
+++ b/source/serializer/tmfile/op/tm2_matmul.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int matmul_op_map(int op)
 {
     return OP_MATMUL;
 }
 
-
 static int tm2_load_matmul(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_matmul_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_matmul_op()
     return 0;
 }
 
-
 int unregister_tm2_matmul_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_maximum.c b/source/serializer/tmfile/op/tm2_maximum.c
index 4319e73f3..ca19f22e9 100644
--- a/source/serializer/tmfile/op/tm2_maximum.c
+++ b/source/serializer/tmfile/op/tm2_maximum.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int max_op_map(int op)
 {
     return OP_MAXIMUM;
 }
 
-
 static int tm2_load_max(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                            const TM2_Operator* tm_op)
+                        const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_maximum_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_maximum_op()
     return 0;
 }
 
-
 int unregister_tm2_maximum_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_mean.c b/source/serializer/tmfile/op/tm2_mean.c
index d6e7888d0..9fa66927c 100644
--- a/source/serializer/tmfile/op/tm2_mean.c
+++ b/source/serializer/tmfile/op/tm2_mean.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int mean_op_map(int op)
 {
     return OP_MEAN;
 }
 
-
 static int tm2_load_mean(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_mean_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_mean_op()
     return 0;
 }
 
-
 int unregister_tm2_mean_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_mish.c b/source/serializer/tmfile/op/tm2_mish.c
index 986169940..a3a46d932 100644
--- a/source/serializer/tmfile/op/tm2_mish.c
+++ b/source/serializer/tmfile/op/tm2_mish.c
@@ -32,19 +32,16 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int mish_op_map(int op)
 {
     return OP_MISH;
 }
 
-
 static int tm2_load_mish(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_mish_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -60,7 +57,6 @@ int register_tm2_mish_op()
     return 0;
 }
 
-
 int unregister_tm2_mish_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_mvn.c b/source/serializer/tmfile/op/tm2_mvn.c
index 49383d5b0..4d8435dad 100644
--- a/source/serializer/tmfile/op/tm2_mvn.c
+++ b/source/serializer/tmfile/op/tm2_mvn.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int mvn_op_map(int op)
 {
     return OP_MVN;
 }
 
-
 static int tm2_load_mvn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                         const TM2_Operator* tm_op)
 {
-    struct mvn_param* gather_param = ( struct mvn_param* )ir_node->op.param_mem;
+    struct mvn_param* gather_param = (struct mvn_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_MVNParam* tm_param = ( TM2_MVNParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_MVNParam* tm_param = (TM2_MVNParam*)(mem_base + tm_op->offset_t_param);
 
     gather_param->across_channels = tm_param->across_channels;
     gather_param->eps = tm_param->eps;
@@ -56,7 +54,6 @@ static int tm2_load_mvn(struct graph* ir_graph, struct node* ir_node, const TM2_
     return 0;
 }
 
-
 int register_tm2_mvn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_mvn_op()
     return 0;
 }
 
-
 int unregister_tm2_mvn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_noop.c b/source/serializer/tmfile/op/tm2_noop.c
index 342ee76f6..e033835e3 100644
--- a/source/serializer/tmfile/op/tm2_noop.c
+++ b/source/serializer/tmfile/op/tm2_noop.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int noop_op_map(int op)
 {
     return OP_NOOP;
 }
 
-
 static int tm2_load_noop(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_noop_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_noop_op()
     return 0;
 }
 
-
 int unregister_tm2_noop_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_normalize.c b/source/serializer/tmfile/op/tm2_normalize.c
index 349052c69..a533212ed 100644
--- a/source/serializer/tmfile/op/tm2_normalize.c
+++ b/source/serializer/tmfile/op/tm2_normalize.c
@@ -34,19 +34,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int normalize_op_map(int op)
 {
     return OP_NORMALIZE;
 }
 
-
 static int tm2_load_normalize(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct normalize_param* normalize_param = ( struct normalize_param* )ir_node->op.param_mem;
+    struct normalize_param* normalize_param = (struct normalize_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_NormalizeParam* tm_param = ( TM2_NormalizeParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_NormalizeParam* tm_param = (TM2_NormalizeParam*)(mem_base + tm_op->offset_t_param);
 
     normalize_param->across_spatial = tm_param->across_spatial;
     normalize_param->channel_shared = tm_param->channel_shared;
@@ -54,7 +52,6 @@ static int tm2_load_normalize(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_normalize_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_normalize_op()
     return 0;
 }
 
-
 int unregister_tm2_normalize_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_pad.c b/source/serializer/tmfile/op/tm2_pad.c
index 6bcf48d39..1c6b36ce6 100644
--- a/source/serializer/tmfile/op/tm2_pad.c
+++ b/source/serializer/tmfile/op/tm2_pad.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int pad_op_map(int op)
 {
     return OP_PAD;
 }
 
-
 static int tm2_load_pad(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                         const TM2_Operator* tm_op)
 {
-    struct pad_param* pad_param = ( struct pad_param* )ir_node->op.param_mem;
+    struct pad_param* pad_param = (struct pad_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_PadParam* tm_param = ( TM2_PadParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_PadParam* tm_param = (TM2_PadParam*)(mem_base + tm_op->offset_t_param);
 
     pad_param->mode = tm_param->mode;
     pad_param->value = tm_param->value;
@@ -63,7 +61,6 @@ static int tm2_load_pad(struct graph* ir_graph, struct node* ir_node, const TM2_
     return 0;
 }
 
-
 int register_tm2_pad_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -79,7 +76,6 @@ int register_tm2_pad_op()
     return 0;
 }
 
-
 int unregister_tm2_pad_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_permute.c b/source/serializer/tmfile/op/tm2_permute.c
index 7ed874ffe..d51fd8681 100644
--- a/source/serializer/tmfile/op/tm2_permute.c
+++ b/source/serializer/tmfile/op/tm2_permute.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int permute_op_map(int op)
 {
     return OP_PERMUTE;
 }
 
-
 static int tm2_load_permute(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct permute_param* permute_param = ( struct permute_param* )ir_node->op.param_mem;
+    struct permute_param* permute_param = (struct permute_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_PermuteParam* tm_param = ( TM2_PermuteParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_PermuteParam* tm_param = (TM2_PermuteParam*)(mem_base + tm_op->offset_t_param);
 
     permute_param->flag = tm_param->flag;
     permute_param->order0 = tm_param->order0;
@@ -58,7 +56,6 @@ static int tm2_load_permute(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_permute_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -74,7 +71,6 @@ int register_tm2_permute_op()
     return 0;
 }
 
-
 int unregister_tm2_permute_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_pool.c b/source/serializer/tmfile/op/tm2_pool.c
index 572bf5cf6..fb08f6b2b 100644
--- a/source/serializer/tmfile/op/tm2_pool.c
+++ b/source/serializer/tmfile/op/tm2_pool.c
@@ -34,7 +34,6 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int pooling_op_map(int op)
 {
     return OP_POOL;
@@ -42,10 +41,10 @@ static int pooling_op_map(int op)
 
 static int tm2_load_pooling(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_PoolParam* tm_param = ( TM2_PoolParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_PoolParam* tm_param = (TM2_PoolParam*)(mem_base + tm_op->offset_t_param);
 
     pool_param->kernel_h = tm_param->kernel_h;
     pool_param->kernel_w = tm_param->kernel_w;
@@ -69,7 +68,6 @@ static int tm2_load_pooling(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_pool_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -85,7 +83,6 @@ int register_tm2_pool_op()
     return 0;
 }
 
-
 int unregister_tm2_pool_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_prelu.c b/source/serializer/tmfile/op/tm2_prelu.c
index c7c934e43..a43223b37 100644
--- a/source/serializer/tmfile/op/tm2_prelu.c
+++ b/source/serializer/tmfile/op/tm2_prelu.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int prelu_op_map(int op)
 {
     return OP_PRELU;
 }
 
-
 static int tm2_load_prelu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_prelu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_prelu_op()
     return 0;
 }
 
-
 int unregister_tm2_prelu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_priorbox.c b/source/serializer/tmfile/op/tm2_priorbox.c
index 6328f2bd5..76e21f261 100644
--- a/source/serializer/tmfile/op/tm2_priorbox.c
+++ b/source/serializer/tmfile/op/tm2_priorbox.c
@@ -35,13 +35,11 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int priorbox_op_map(int op)
 {
     return OP_PRIORBOX;
 }
 
-
 static int tm2_load_priorbox(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     struct priorbox_param* priorbox_param = (struct priorbox_param*)ir_node->op.param_mem;
@@ -86,10 +84,8 @@ static int tm2_load_priorbox(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 // TODO: add unload op
 
-
 int register_tm2_priorbox_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -105,7 +101,6 @@ int register_tm2_priorbox_op()
     return 0;
 }
 
-
 int unregister_tm2_priorbox_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_psroipooling.c b/source/serializer/tmfile/op/tm2_psroipooling.c
index ed86af268..6548e8319 100644
--- a/source/serializer/tmfile/op/tm2_psroipooling.c
+++ b/source/serializer/tmfile/op/tm2_psroipooling.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int psroipooling_op_map(int op)
 {
     return OP_PSROIPOOLING;
 }
 
-
 static int tm2_load_psroipooling(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                  const TM2_Operator* tm_op)
 {
-    struct psroipooling_param* psroipooling_param = ( struct psroipooling_param* )ir_node->op.param_mem;
+    struct psroipooling_param* psroipooling_param = (struct psroipooling_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_PsroipoolingParam* tm_param = ( TM2_PsroipoolingParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_PsroipoolingParam* tm_param = (TM2_PsroipoolingParam*)(mem_base + tm_op->offset_t_param);
 
     psroipooling_param->pooled_w = tm_param->pooled_w;
     psroipooling_param->pooled_h = tm_param->pooled_h;
@@ -57,7 +55,6 @@ static int tm2_load_psroipooling(struct graph* ir_graph, struct node* ir_node, c
     return 0;
 }
 
-
 int register_tm2_psroipooling_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -73,7 +70,6 @@ int register_tm2_psroipooling_op()
     return 0;
 }
 
-
 int unregister_tm2_psroipooling_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_reciprocal.c b/source/serializer/tmfile/op/tm2_reciprocal.c
index ff3da06df..384d70406 100644
--- a/source/serializer/tmfile/op/tm2_reciprocal.c
+++ b/source/serializer/tmfile/op/tm2_reciprocal.c
@@ -38,7 +38,7 @@ static int reciprocal_op_map(int op)
 }
 
 static int tm2_load_reciprocal(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                          const TM2_Operator* tm_op)
+                               const TM2_Operator* tm_op)
 {
     return 0;
 }
diff --git a/source/serializer/tmfile/op/tm2_reducel2.c b/source/serializer/tmfile/op/tm2_reducel2.c
index 5c75bf27c..942780629 100644
--- a/source/serializer/tmfile/op/tm2_reducel2.c
+++ b/source/serializer/tmfile/op/tm2_reducel2.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int reducel2_op_map(int op)
 {
     return OP_REDUCEL2;
 }
 
-
 static int tm2_load_reducel2(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                              const TM2_Operator* tm_op)
 {
-    struct reducel2_param* reducel2_param = ( struct reducel2_param* )ir_node->op.param_mem;
+    struct reducel2_param* reducel2_param = (struct reducel2_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ReduceL2Param* tm_param = ( TM2_ReduceL2Param* )(mem_base + tm_op->offset_t_param);
+    const TM2_ReduceL2Param* tm_param = (TM2_ReduceL2Param*)(mem_base + tm_op->offset_t_param);
 
     reducel2_param->axis = tm_param->axis;
     reducel2_param->keepdim = tm_param->keepdim;
@@ -55,7 +53,6 @@ static int tm2_load_reducel2(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_reducel2_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_reducel2_op()
     return 0;
 }
 
-
 int unregister_tm2_reducel2_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_reduction.c b/source/serializer/tmfile/op/tm2_reduction.c
index a7d68cb67..e7b548003 100644
--- a/source/serializer/tmfile/op/tm2_reduction.c
+++ b/source/serializer/tmfile/op/tm2_reduction.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int reduction_op_map(int op)
 {
     return OP_REDUCTION;
 }
 
-
 static int tm2_load_reduction(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                               const TM2_Operator* tm_op)
 {
-    struct reduction_param* reduction_param = ( struct reduction_param* )ir_node->op.param_mem;
+    struct reduction_param* reduction_param = (struct reduction_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ReductionParam* tm_param = ( TM2_ReductionParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ReductionParam* tm_param = (TM2_ReductionParam*)(mem_base + tm_op->offset_t_param);
 
     reduction_param->dim_0 = tm_param->dim_0;
     reduction_param->dim_1 = tm_param->dim_1;
@@ -59,7 +57,6 @@ static int tm2_load_reduction(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_reduction_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -75,7 +72,6 @@ int register_tm2_reduction_op()
     return 0;
 }
 
-
 int unregister_tm2_reduction_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_region.c b/source/serializer/tmfile/op/tm2_region.c
index 0effa5d3b..15d55646e 100644
--- a/source/serializer/tmfile/op/tm2_region.c
+++ b/source/serializer/tmfile/op/tm2_region.c
@@ -35,20 +35,18 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int region_op_map(int op)
 {
     return OP_REGION;
 }
 
-
 static int tm2_load_region(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
-    struct region_param* region_param = ( struct region_param* )ir_node->op.param_mem;
+    struct region_param* region_param = (struct region_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_RegionParam* tm_param = ( TM2_RegionParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_RegionParam* tm_param = (TM2_RegionParam*)(mem_base + tm_op->offset_t_param);
     const TM2_Vector_floats* v_biases = (TM2_Vector_floats*)(mem_base + tm_param->offset_vf_biases);
 
     region_param->num_classes = tm_param->num_classes;
@@ -67,7 +65,6 @@ static int tm2_load_region(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_region_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -83,7 +80,6 @@ int register_tm2_region_op()
     return 0;
 }
 
-
 int unregister_tm2_region_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_relu.c b/source/serializer/tmfile/op/tm2_relu.c
index 22282f3f9..d6fb24a57 100644
--- a/source/serializer/tmfile/op/tm2_relu.c
+++ b/source/serializer/tmfile/op/tm2_relu.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int relu_op_map(int op)
 {
     return OP_RELU;
 }
 
-
 static int tm2_load_relu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct relu_param* relu_param = ( struct relu_param* )ir_node->op.param_mem;
+    struct relu_param* relu_param = (struct relu_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ReLuParam* tm_param = ( TM2_ReLuParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ReLuParam* tm_param = (TM2_ReLuParam*)(mem_base + tm_op->offset_t_param);
 
     relu_param->negative_slope = tm_param->negative_slope;
 
     return 0;
 }
 
-
 int register_tm2_relu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_relu_op()
     return 0;
 }
 
-
 int unregister_tm2_relu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_relu1.c b/source/serializer/tmfile/op/tm2_relu1.c
index cba2ea20c..732cfc8e2 100644
--- a/source/serializer/tmfile/op/tm2_relu1.c
+++ b/source/serializer/tmfile/op/tm2_relu1.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int relu1_op_map(int op)
 {
     return OP_RELU1;
 }
 
-
 static int tm2_load_relu1(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_relu1_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_relu1_op()
     return 0;
 }
 
-
 int unregister_tm2_relu1_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_relu6.c b/source/serializer/tmfile/op/tm2_relu6.c
index 46686be27..74faff826 100644
--- a/source/serializer/tmfile/op/tm2_relu6.c
+++ b/source/serializer/tmfile/op/tm2_relu6.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int relu6_op_map(int op)
 {
     return OP_RELU6;
 }
 
-
 static int tm2_load_relu6(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_relu6_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_relu6_op()
     return 0;
 }
 
-
 int unregister_tm2_relu6_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_reorg.c b/source/serializer/tmfile/op/tm2_reorg.c
index 02e801945..c28667bdc 100644
--- a/source/serializer/tmfile/op/tm2_reorg.c
+++ b/source/serializer/tmfile/op/tm2_reorg.c
@@ -34,26 +34,23 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int reorg_op_map(int op)
 {
     return OP_REORG;
 }
 
-
 static int tm2_load_reorg(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct reorg_param* reorg_param = ( struct reorg_param* )ir_node->op.param_mem;
+    struct reorg_param* reorg_param = (struct reorg_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ReorgParam* tm_param = ( TM2_ReorgParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ReorgParam* tm_param = (TM2_ReorgParam*)(mem_base + tm_op->offset_t_param);
 
     reorg_param->stride = tm_param->stride;
 
     return 0;
 }
 
-
 int register_tm2_reorg_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -69,7 +66,6 @@ int register_tm2_reorg_op()
     return 0;
 }
 
-
 int unregister_tm2_reorg_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_reshape.c b/source/serializer/tmfile/op/tm2_reshape.c
index e890cfad3..4be1bfe08 100644
--- a/source/serializer/tmfile/op/tm2_reshape.c
+++ b/source/serializer/tmfile/op/tm2_reshape.c
@@ -35,20 +35,18 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int reshape_op_map(int op)
 {
     return OP_RESHAPE;
 }
 
-
 static int tm2_load_reshape(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct reshape_param* param = ( struct reshape_param* )ir_node->op.param_mem;
+    struct reshape_param* param = (struct reshape_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ReshapeParam* tm_param = ( TM2_ReshapeParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ReshapeParam* tm_param = (TM2_ReshapeParam*)(mem_base + tm_op->offset_t_param);
     // set the reverse
     if (tm_param->reverse)
         param->reverse = true;
@@ -62,10 +60,10 @@ static int tm2_load_reshape(struct graph* ir_graph, struct node* ir_node, const
 
     if (tm_param->offset_re_shape != TM2_NOT_SET)
     {
-        const TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_re_shape);
+        const TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_re_shape);
         param->dim_size = v_re_shape->v_num;
 
-        param->re_shape = ( int* )sys_malloc(v_re_shape->v_num * sizeof(int));
+        param->re_shape = (int*)sys_malloc(v_re_shape->v_num * sizeof(int));
 
         for (unsigned int i = 0; i < v_re_shape->v_num; i++)
         {
@@ -76,7 +74,6 @@ static int tm2_load_reshape(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_reshape_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -92,7 +89,6 @@ int register_tm2_reshape_op()
     return 0;
 }
 
-
 int unregister_tm2_reshape_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_resize.c b/source/serializer/tmfile/op/tm2_resize.c
index 4e7a2fe1f..75fb2c9b6 100644
--- a/source/serializer/tmfile/op/tm2_resize.c
+++ b/source/serializer/tmfile/op/tm2_resize.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int resize_op_map(int op)
 {
     return OP_RESIZE;
 }
 
-
 static int tm2_load_resize(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
-    struct resize_param* resize_param = ( struct resize_param* )ir_node->op.param_mem;
+    struct resize_param* resize_param = (struct resize_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ResizeParam* tm_param = ( TM2_ResizeParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ResizeParam* tm_param = (TM2_ResizeParam*)(mem_base + tm_op->offset_t_param);
 
     resize_param->scale_h = tm_param->scale_x;
     resize_param->scale_w = tm_param->scale_y;
@@ -55,7 +53,6 @@ static int tm2_load_resize(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_resize_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_resize_op()
     return 0;
 }
 
-
 int unregister_tm2_resize_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_reverse.c b/source/serializer/tmfile/op/tm2_reverse.c
index 6107d44e1..ec37b4fd1 100644
--- a/source/serializer/tmfile/op/tm2_reverse.c
+++ b/source/serializer/tmfile/op/tm2_reverse.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int reverse_op_map(int op)
 {
     return OP_REVERSE;
 }
 
-
 static int tm2_load_reverse(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_reverse_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_reverse_op()
     return 0;
 }
 
-
 int unregister_tm2_reverse_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_rnn.c b/source/serializer/tmfile/op/tm2_rnn.c
index 82ee818ca..0c8a94c49 100644
--- a/source/serializer/tmfile/op/tm2_rnn.c
+++ b/source/serializer/tmfile/op/tm2_rnn.c
@@ -1,4 +1,4 @@
-  /*
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int rnn_op_map(int op)
 {
     return OP_RNN;
 }
 
-
 static int tm2_load_rnn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                            const TM2_Operator* tm_op)
+                        const TM2_Operator* tm_op)
 {
-    struct rnn_param* rnn_param = (struct rnn_param* )ir_node->op.param_mem;
-    const struct tm2_priv* tm2_priv = (struct tm2_priv* )ir_graph->serializer_privacy;
+    struct rnn_param* rnn_param = (struct rnn_param*)ir_node->op.param_mem;
+    const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_RnnParam* tm_param = (TM2_RnnParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_RnnParam* tm_param = (TM2_RnnParam*)(mem_base + tm_op->offset_t_param);
 
     rnn_param->clip = tm_param->clip;
     rnn_param->output_len = tm_param->output_len;
@@ -62,12 +60,11 @@ static int tm2_load_rnn(struct graph* ir_graph, struct node* ir_node, const TM2_
     return 0;
 }
 
-
 int register_tm2_rnn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
 
-    if(tm2_s == NULL)
+    if (tm2_s == NULL)
     {
         TLOG_ERR("tengine serializer has not been registered yet\n");
         return -1;
@@ -78,7 +75,6 @@ int register_tm2_rnn_op()
     return 0;
 }
 
-
 int unregister_tm2_rnn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_roialign.c b/source/serializer/tmfile/op/tm2_roialign.c
index 746436fa9..44626fa7d 100644
--- a/source/serializer/tmfile/op/tm2_roialign.c
+++ b/source/serializer/tmfile/op/tm2_roialign.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int roialign_op_map(int op)
 {
     return OP_ROIALIGN;
 }
 
-
 static int tm2_load_roialign(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                              const TM2_Operator* tm_op)
 {
-    struct roialign_param* roialign_param = ( struct roialign_param* )ir_node->op.param_mem;
+    struct roialign_param* roialign_param = (struct roialign_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_RoialignParam* tm_param = ( TM2_RoialignParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_RoialignParam* tm_param = (TM2_RoialignParam*)(mem_base + tm_op->offset_t_param);
 
     roialign_param->pooled_width = tm_param->pooled_width;
     roialign_param->pooled_height = tm_param->pooled_height;
@@ -56,7 +54,6 @@ static int tm2_load_roialign(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_roialign_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_roialign_op()
     return 0;
 }
 
-
 int unregister_tm2_roialign_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_roipooling.c b/source/serializer/tmfile/op/tm2_roipooling.c
index 53d8ec007..b1d617f98 100644
--- a/source/serializer/tmfile/op/tm2_roipooling.c
+++ b/source/serializer/tmfile/op/tm2_roipooling.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int roi_pooling_op_map(int op)
 {
     return OP_ROIPOOLING;
 }
 
-
 static int tm2_load_roi_pooling(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                 const TM2_Operator* tm_op)
 {
-    struct roipooling_param* roi_pooling_param = ( struct roipooling_param* )ir_node->op.param_mem;
+    struct roipooling_param* roi_pooling_param = (struct roipooling_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ROIPoolingParam* tm_param = ( TM2_ROIPoolingParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ROIPoolingParam* tm_param = (TM2_ROIPoolingParam*)(mem_base + tm_op->offset_t_param);
 
     roi_pooling_param->pooled_h = tm_param->pooled_h;
     roi_pooling_param->pooled_w = tm_param->pooled_w;
@@ -56,7 +54,6 @@ static int tm2_load_roi_pooling(struct graph* ir_graph, struct node* ir_node, co
     return 0;
 }
 
-
 int register_tm2_roipooling_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_roipooling_op()
     return 0;
 }
 
-
 int unregister_tm2_roipooling_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_round.c b/source/serializer/tmfile/op/tm2_round.c
index 757430933..f7bcebc58 100644
--- a/source/serializer/tmfile/op/tm2_round.c
+++ b/source/serializer/tmfile/op/tm2_round.c
@@ -32,24 +32,21 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int round_op_map(int op)
 {
     return OP_ROUND;
 }
 
-
 static int tm2_load_round(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_round_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
 
-    if(tm2_s == NULL)
+    if (tm2_s == NULL)
     {
         TLOG_ERR("tengine serializer has not been registered yet\n");
         return -1;
@@ -60,7 +57,6 @@ int register_tm2_round_op()
     return 0;
 }
 
-
 int unregister_tm2_round_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_rpn.c b/source/serializer/tmfile/op/tm2_rpn.c
index e43ec3f3b..a662076b9 100644
--- a/source/serializer/tmfile/op/tm2_rpn.c
+++ b/source/serializer/tmfile/op/tm2_rpn.c
@@ -35,20 +35,18 @@
 #include "utility/vector.h"
 #include "utility/log.h"
 
-
 static int rpn_op_map(int op)
 {
     return OP_RPN;
 }
 
-
 static int tm2_load_rpn(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                         const TM2_Operator* tm_op)
 {
-    struct rpn_param* rpn_param = ( struct rpn_param* )ir_node->op.param_mem;
+    struct rpn_param* rpn_param = (struct rpn_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_RPNParam* tm_param = ( TM2_RPNParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_RPNParam* tm_param = (TM2_RPNParam*)(mem_base + tm_op->offset_t_param);
 
     rpn_param->basesize = tm_param->basesize;
     rpn_param->feat_stride = tm_param->feat_stride;
@@ -65,7 +63,7 @@ static int tm2_load_rpn(struct graph* ir_graph, struct node* ir_node, const TM2_
 
         for (unsigned int i = 0; i < v_anchor_scales->v_num; i++)
         {
-            push_vector_data(rpn_param->anchor_scales, ( void* )&v_anchor_scales->data[i]);
+            push_vector_data(rpn_param->anchor_scales, (void*)&v_anchor_scales->data[i]);
         }
     }
 
@@ -77,14 +75,13 @@ static int tm2_load_rpn(struct graph* ir_graph, struct node* ir_node, const TM2_
 
         for (unsigned int i = 0; i < v_ratios->v_num; i++)
         {
-            push_vector_data(rpn_param->ratios, ( void* )&v_ratios->data[i]);
+            push_vector_data(rpn_param->ratios, (void*)&v_ratios->data[i]);
         }
     }
 
     return 0;
 }
 
-
 int register_tm2_rpn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -100,7 +97,6 @@ int register_tm2_rpn_op()
     return 0;
 }
 
-
 int unregister_tm2_rpn_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_scale.c b/source/serializer/tmfile/op/tm2_scale.c
index 287a5001e..00d11b25f 100644
--- a/source/serializer/tmfile/op/tm2_scale.c
+++ b/source/serializer/tmfile/op/tm2_scale.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int scale_op_map(int op)
 {
     return OP_SCALE;
 }
 
-
 static int tm2_load_scale(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
-    struct scale_param* scale_param = ( struct scale_param* )ir_node->op.param_mem;
+    struct scale_param* scale_param = (struct scale_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ScaleParam* tm_param = ( TM2_ScaleParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ScaleParam* tm_param = (TM2_ScaleParam*)(mem_base + tm_op->offset_t_param);
 
     scale_param->axis = tm_param->axis;
     scale_param->num_axes = tm_param->num_axes;
@@ -56,7 +54,6 @@ static int tm2_load_scale(struct graph* ir_graph, struct node* ir_node, const TM
     return 0;
 }
 
-
 int register_tm2_scale_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_scale_op()
     return 0;
 }
 
-
 int unregister_tm2_scale_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_scatter.c b/source/serializer/tmfile/op/tm2_scatter.c
index a4e416c19..a7add086b 100644
--- a/source/serializer/tmfile/op/tm2_scatter.c
+++ b/source/serializer/tmfile/op/tm2_scatter.c
@@ -34,15 +34,13 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int scatter_op_map(int op)
 {
     return OP_SCATTER;
 }
 
-
 static int tm2_load_scatter(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                          const TM2_Operator* tm_op)
+                            const TM2_Operator* tm_op)
 {
     struct scatter_param* scatter_param = (struct scatter_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
@@ -54,7 +52,6 @@ static int tm2_load_scatter(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_scatter_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_scatter_op()
     return 0;
 }
 
-
 int unregister_tm2_scatter_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_selu.c b/source/serializer/tmfile/op/tm2_selu.c
index 34f1e5f4a..daae951da 100644
--- a/source/serializer/tmfile/op/tm2_selu.c
+++ b/source/serializer/tmfile/op/tm2_selu.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int selu_op_map(int op)
 {
     return OP_SELU;
 }
 
-
 static int tm2_load_selu(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
-    struct selu_param* selu_param = ( struct selu_param* )ir_node->op.param_mem;
+    struct selu_param* selu_param = (struct selu_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SeluParam* tm_param = ( TM2_SeluParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SeluParam* tm_param = (TM2_SeluParam*)(mem_base + tm_op->offset_t_param);
 
     selu_param->alpha = tm_param->alpha;
     selu_param->lambda = tm_param->lambda;
@@ -55,7 +53,6 @@ static int tm2_load_selu(struct graph* ir_graph, struct node* ir_node, const TM2
     return 0;
 }
 
-
 int register_tm2_selu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_selu_op()
     return 0;
 }
 
-
 int unregister_tm2_selu_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_shape.c b/source/serializer/tmfile/op/tm2_shape.c
index fc8394679..ae2770821 100644
--- a/source/serializer/tmfile/op/tm2_shape.c
+++ b/source/serializer/tmfile/op/tm2_shape.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int shape_op_map(int op)
 {
     return OP_SHAPE;
 }
 
-
 static int tm2_load_shape(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_shape_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_shape_op()
     return 0;
 }
 
-
 int unregister_tm2_shape_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_shuffle_channel.c b/source/serializer/tmfile/op/tm2_shuffle_channel.c
index f4e3086f4..b2323e3b7 100644
--- a/source/serializer/tmfile/op/tm2_shuffle_channel.c
+++ b/source/serializer/tmfile/op/tm2_shuffle_channel.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int shuffle_channel_op_map(int op)
 {
     return OP_SHUFFLECHANNEL;
 }
 
-
 static int tm2_load_shuffle_channel(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                     const TM2_Operator* tm_op)
 {
-    struct shuffle_channel_param* param = ( struct shuffle_channel_param* )ir_node->op.param_mem;
+    struct shuffle_channel_param* param = (struct shuffle_channel_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ShuffleChannelParam* tm_param = ( TM2_ShuffleChannelParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ShuffleChannelParam* tm_param = (TM2_ShuffleChannelParam*)(mem_base + tm_op->offset_t_param);
 
     param->group = tm_param->group;
 
     return 0;
 }
 
-
 int register_tm2_shuffle_channel_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_shuffle_channel_op()
     return 0;
 }
 
-
 int unregister_tm2_shuffle_channel_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_sigmoid.c b/source/serializer/tmfile/op/tm2_sigmoid.c
index 51709da66..6cd020db2 100644
--- a/source/serializer/tmfile/op/tm2_sigmoid.c
+++ b/source/serializer/tmfile/op/tm2_sigmoid.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int sigmoid_op_map(int op)
 {
     return OP_SIGMOID;
 }
 
-
 static int tm2_load_sigmoid(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_sigmoid_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_sigmoid_op()
     return 0;
 }
 
-
 int unregister_tm2_sigmoid_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_slice.c b/source/serializer/tmfile/op/tm2_slice.c
index e2c6c80a5..42a011a2c 100644
--- a/source/serializer/tmfile/op/tm2_slice.c
+++ b/source/serializer/tmfile/op/tm2_slice.c
@@ -35,17 +35,15 @@
 #include "utility/vector.h"
 #include "utility/log.h"
 
-
 static int slice_op_map(int op)
 {
     return OP_SLICE;
 }
 
-
 static int tm2_load_slice(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
-    struct slice_param* slice_param = ( struct slice_param* )ir_node->op.param_mem;
+    struct slice_param* slice_param = (struct slice_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
     const TM2_SliceParam* tm_param = (TM2_SliceParam*)(mem_base + tm_op->offset_t_param);
@@ -64,35 +62,34 @@ static int tm2_load_slice(struct graph* ir_graph, struct node* ir_node, const TM
 
     if (tm_param->offset_vi_begins != TM2_NOT_SET)
     {
-        const TM2_Vector_indices* v_begins = ( TM2_Vector_indices* )(mem_base + tm_param->offset_vi_begins);
+        const TM2_Vector_indices* v_begins = (TM2_Vector_indices*)(mem_base + tm_param->offset_vi_begins);
         for (unsigned int i = 0; i < v_begins->v_num; i++)
         {
-            push_vector_data(slice_param->begin_, ( void* )&v_begins->indices[i]);
+            push_vector_data(slice_param->begin_, (void*)&v_begins->indices[i]);
         }
     }
 
     if (tm_param->offset_vi_sizes != TM2_NOT_SET)
     {
-        const TM2_Vector_indices* v_size = ( TM2_Vector_indices* )(mem_base + tm_param->offset_vi_sizes);
+        const TM2_Vector_indices* v_size = (TM2_Vector_indices*)(mem_base + tm_param->offset_vi_sizes);
         for (unsigned int i = 0; i < v_size->v_num; i++)
         {
-            push_vector_data(slice_param->size_, ( void* )&v_size->indices[i]);
+            push_vector_data(slice_param->size_, (void*)&v_size->indices[i]);
         }
     }
 
     if (tm_param->offset_vi_slice_points != TM2_NOT_SET)
     {
-        const TM2_Vector_indices* v_slice_point = ( TM2_Vector_indices* )(mem_base + tm_param->offset_vi_slice_points);
+        const TM2_Vector_indices* v_slice_point = (TM2_Vector_indices*)(mem_base + tm_param->offset_vi_slice_points);
         for (unsigned int i = 0; i < v_slice_point->v_num; i++)
         {
-            push_vector_data(slice_param->slice_point_, ( void* )&v_slice_point->indices[i]);
+            push_vector_data(slice_param->slice_point_, (void*)&v_slice_point->indices[i]);
         }
     }
 
     return 0;
 }
 
-
 int register_tm2_slice_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -108,7 +105,6 @@ int register_tm2_slice_op()
     return 0;
 }
 
-
 int unregister_tm2_slice_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_softmax.c b/source/serializer/tmfile/op/tm2_softmax.c
index 0eb832202..9cf340b07 100644
--- a/source/serializer/tmfile/op/tm2_softmax.c
+++ b/source/serializer/tmfile/op/tm2_softmax.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int softmax_op_map(int op)
 {
     return OP_SOFTMAX;
 }
 
-
 static int tm2_load_softmax(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct softmax_param* softmax_param = ( struct softmax_param* )ir_node->op.param_mem;
+    struct softmax_param* softmax_param = (struct softmax_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SoftmaxParam* tm_param = ( TM2_SoftmaxParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SoftmaxParam* tm_param = (TM2_SoftmaxParam*)(mem_base + tm_op->offset_t_param);
 
     softmax_param->axis = tm_param->axis;
 
     return 0;
 }
 
-
 int register_tm2_softmax_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_softmax_op()
     return 0;
 }
 
-
 int unregister_tm2_softmax_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_softplus.c b/source/serializer/tmfile/op/tm2_softplus.c
index 5e0f10f16..9f35786fa 100644
--- a/source/serializer/tmfile/op/tm2_softplus.c
+++ b/source/serializer/tmfile/op/tm2_softplus.c
@@ -38,7 +38,7 @@ static int softplus_op_map(int op)
 }
 
 static int tm2_load_softplus(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                          const TM2_Operator* tm_op)
+                             const TM2_Operator* tm_op)
 {
     return 0;
 }
diff --git a/source/serializer/tmfile/op/tm2_spacetobatchnd.c b/source/serializer/tmfile/op/tm2_spacetobatchnd.c
index 2163d2fee..7f6f5aa3e 100644
--- a/source/serializer/tmfile/op/tm2_spacetobatchnd.c
+++ b/source/serializer/tmfile/op/tm2_spacetobatchnd.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int spacetobatchnd_op_map(int op)
 {
     return OP_SPACETOBATCHND;
 }
 
-
 static int tm2_load_spacetobatchnd(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                    const TM2_Operator* tm_op)
 {
-    struct spacetobatchnd_param* spacetobatchnd_param = ( struct spacetobatchnd_param* )ir_node->op.param_mem;
+    struct spacetobatchnd_param* spacetobatchnd_param = (struct spacetobatchnd_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SpaceToBatchNDParam* tm_param = ( TM2_SpaceToBatchNDParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SpaceToBatchNDParam* tm_param = (TM2_SpaceToBatchNDParam*)(mem_base + tm_op->offset_t_param);
 
     spacetobatchnd_param->dilation_x = tm_param->dilation_x;
     spacetobatchnd_param->dilation_y = tm_param->dilation_y;
@@ -59,7 +57,6 @@ static int tm2_load_spacetobatchnd(struct graph* ir_graph, struct node* ir_node,
     return 0;
 }
 
-
 int register_tm2_spacetobatchnd_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -75,7 +72,6 @@ int register_tm2_spacetobatchnd_op()
     return 0;
 }
 
-
 int unregister_tm2_spacetobatchnd_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_spacetodepth.c b/source/serializer/tmfile/op/tm2_spacetodepth.c
index 5eae73a50..d305df490 100644
--- a/source/serializer/tmfile/op/tm2_spacetodepth.c
+++ b/source/serializer/tmfile/op/tm2_spacetodepth.c
@@ -32,19 +32,16 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int spacetodepth_op_map(int op)
 {
     return OP_SPACETODEPTH;
 }
 
-
 static int tm2_load_spacetodepth(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_spacetodepth_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -60,7 +57,6 @@ int register_tm2_spacetodepth_op()
     return 0;
 }
 
-
 int unregister_tm2_spacetodepth_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_sparsetodense.c b/source/serializer/tmfile/op/tm2_sparsetodense.c
index 0a73bac7b..e0504c69e 100644
--- a/source/serializer/tmfile/op/tm2_sparsetodense.c
+++ b/source/serializer/tmfile/op/tm2_sparsetodense.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int sparsetodense_op_map(int op)
 {
     return OP_SPARSETODENSE;
 }
 
-
 static int tm2_load_sparsetodense(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                   const TM2_Operator* tm_op)
 {
-    struct sparsetodense_param* sparsetodense_param = ( struct sparsetodense_param* )ir_node->op.param_mem;
+    struct sparsetodense_param* sparsetodense_param = (struct sparsetodense_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SparseToDenseParam* tm_param = ( TM2_SparseToDenseParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SparseToDenseParam* tm_param = (TM2_SparseToDenseParam*)(mem_base + tm_op->offset_t_param);
 
     sparsetodense_param->default_value = tm_param->default_value;
     sparsetodense_param->output_shape_size0 = tm_param->output_shape_size0;
@@ -56,7 +54,6 @@ static int tm2_load_sparsetodense(struct graph* ir_graph, struct node* ir_node,
     return 0;
 }
 
-
 int register_tm2_sparsetodense_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -72,7 +69,6 @@ int register_tm2_sparsetodense_op()
     return 0;
 }
 
-
 int unregister_tm2_sparsetodense_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_spatialtransformer.c b/source/serializer/tmfile/op/tm2_spatialtransformer.c
index f5ba0273b..4537ddb64 100644
--- a/source/serializer/tmfile/op/tm2_spatialtransformer.c
+++ b/source/serializer/tmfile/op/tm2_spatialtransformer.c
@@ -35,29 +35,27 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int spatialtransformer_op_map(int op)
 {
     return OP_SPATIALTRANSFORMER;
 }
 
-
 static int tm2_load_spatialtransformer(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
-                                  const TM2_Operator* tm_op)
+                                       const TM2_Operator* tm_op)
 {
-    struct spatialtransformer_param* param = ( struct spatialtransformer_param* )ir_node->op.param_mem;
+    struct spatialtransformer_param* param = (struct spatialtransformer_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SpatialTransformerParam* tm_param = ( TM2_SpatialTransformerParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SpatialTransformerParam* tm_param = (TM2_SpatialTransformerParam*)(mem_base + tm_op->offset_t_param);
 
     param->sampler_type = tm_param->sampler_type;
     param->transformer_type = tm_param->transformer_type;
     int index = 0;
     if (tm_param->offset_ta_shape != TM2_NOT_SET)
     {
-        const TM2_Vector_dims* v_ta_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_ta_shape);
+        const TM2_Vector_dims* v_ta_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_ta_shape);
 
-        param->target_shape = ( int* )sys_malloc(v_ta_shape->v_num * sizeof(int));
+        param->target_shape = (int*)sys_malloc(v_ta_shape->v_num * sizeof(int));
         for (unsigned int i = 0; i < v_ta_shape->v_num; i++)
         {
             param->target_shape[i] = v_ta_shape->dims[i];
@@ -66,7 +64,6 @@ static int tm2_load_spatialtransformer(struct graph* ir_graph, struct node* ir_n
     return 0;
 }
 
-
 int register_tm2_spatialtransformer_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -82,7 +79,6 @@ int register_tm2_spatialtransformer_op()
     return 0;
 }
 
-
 int unregister_tm2_spatialtransformer_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_split.c b/source/serializer/tmfile/op/tm2_split.c
index 850ecaebb..a96abe453 100644
--- a/source/serializer/tmfile/op/tm2_split.c
+++ b/source/serializer/tmfile/op/tm2_split.c
@@ -35,20 +35,18 @@
 #include "utility/vector.h"
 #include "utility/log.h"
 
-
 static int split_op_map(int op)
 {
     return OP_SPLIT;
 }
 
-
 static int tm2_load_split(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
-    struct split_param* split_param = ( struct split_param* )ir_node->op.param_mem;
+    struct split_param* split_param = (struct split_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SplitParam* tm_param = ( TM2_SplitParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SplitParam* tm_param = (TM2_SplitParam*)(mem_base + tm_op->offset_t_param);
 
     if (tm_param->is_caffe)
         split_param->is_caffe = true;
@@ -67,13 +65,13 @@ static int tm2_load_split(struct graph* ir_graph, struct node* ir_node, const TM
         split_param->split_dim = tm_param->split_dim;
         if (tm_param->offset_split_sizes != TM2_NOT_SET)
         {
-            const TM2_Vector_dims* v_split_sizes = ( TM2_Vector_dims* )(mem_base + tm_param->offset_split_sizes);
+            const TM2_Vector_dims* v_split_sizes = (TM2_Vector_dims*)(mem_base + tm_param->offset_split_sizes);
             split_param->split_sizes_ = create_vector(sizeof(int), NULL);
 
             for (int i = 0; i < v_split_sizes->v_num; i++)
             {
                 int dim = v_split_sizes->dims[i];
-                push_vector_data(split_param->split_sizes_, ( void* )(&dim));
+                push_vector_data(split_param->split_sizes_, (void*)(&dim));
             }
         }
     }
@@ -81,7 +79,6 @@ static int tm2_load_split(struct graph* ir_graph, struct node* ir_node, const TM
     return 0;
 }
 
-
 int register_tm2_split_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -97,7 +94,6 @@ int register_tm2_split_op()
     return 0;
 }
 
-
 int unregister_tm2_split_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_squareddifference.c b/source/serializer/tmfile/op/tm2_squareddifference.c
index 8dbe175de..51f27c99f 100644
--- a/source/serializer/tmfile/op/tm2_squareddifference.c
+++ b/source/serializer/tmfile/op/tm2_squareddifference.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int squareddifference_op_map(int op)
 {
     return OP_SQUAREDDIFFERENCE;
 }
 
-
 static int tm2_load_squareddifference(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                       const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_squareddifference_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -62,7 +59,6 @@ int register_tm2_squareddifference_op()
     return 0;
 }
 
-
 int unregister_tm2_squareddifference_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_squeeze.c b/source/serializer/tmfile/op/tm2_squeeze.c
index a59f821e2..f3cec3b5b 100644
--- a/source/serializer/tmfile/op/tm2_squeeze.c
+++ b/source/serializer/tmfile/op/tm2_squeeze.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int squeeze_op_map(int op)
 {
     return OP_SQUEEZE;
 }
 
-
 static int tm2_load_squeeze(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                             const TM2_Operator* tm_op)
 {
-    struct squeeze_param* squeeze_param = ( struct squeeze_param* )ir_node->op.param_mem;
+    struct squeeze_param* squeeze_param = (struct squeeze_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SqueezeParam* tm_param = ( TM2_SqueezeParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SqueezeParam* tm_param = (TM2_SqueezeParam*)(mem_base + tm_op->offset_t_param);
 
     squeeze_param->dim_0 = tm_param->dim_0;
     squeeze_param->dim_1 = tm_param->dim_1;
@@ -57,7 +55,6 @@ static int tm2_load_squeeze(struct graph* ir_graph, struct node* ir_node, const
     return 0;
 }
 
-
 int register_tm2_squeeze_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -73,7 +70,6 @@ int register_tm2_squeeze_op()
     return 0;
 }
 
-
 int unregister_tm2_squeeze_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_strided_slice.c b/source/serializer/tmfile/op/tm2_strided_slice.c
index a11e8b8b7..4fbf03df2 100644
--- a/source/serializer/tmfile/op/tm2_strided_slice.c
+++ b/source/serializer/tmfile/op/tm2_strided_slice.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int strided_slice_op_map(int op)
 {
     return OP_STRIDED_SLICE;
 }
 
-
 static int tm2_load_strided_slice(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                                   const TM2_Operator* tm_op)
 {
-    struct strided_slice_param* strided_slice_param = ( struct strided_slice_param* )ir_node->op.param_mem;
+    struct strided_slice_param* strided_slice_param = (struct strided_slice_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_StridedSliceParam* tm_param = ( TM2_StridedSliceParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_StridedSliceParam* tm_param = (TM2_StridedSliceParam*)(mem_base + tm_op->offset_t_param);
 
     strided_slice_param->begin[0] = tm_param->begin_n;
     strided_slice_param->begin[1] = tm_param->begin_c;
@@ -65,7 +63,6 @@ static int tm2_load_strided_slice(struct graph* ir_graph, struct node* ir_node,
     return 0;
 }
 
-
 int register_tm2_strided_slice_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -81,7 +78,6 @@ int register_tm2_strided_slice_op()
     return 0;
 }
 
-
 int unregister_tm2_strided_slice_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_swap_axis.c b/source/serializer/tmfile/op/tm2_swap_axis.c
index 8860b30cf..3332e144c 100644
--- a/source/serializer/tmfile/op/tm2_swap_axis.c
+++ b/source/serializer/tmfile/op/tm2_swap_axis.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int swap_axis_op_map(int op)
 {
     return OP_SWAP_AXIS;
 }
 
-
 static int tm2_load_swap_axis(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                               const TM2_Operator* tm_op)
 {
-    struct swap_axis_param* swap_axis_param = ( struct swap_axis_param* )ir_node->op.param_mem;
+    struct swap_axis_param* swap_axis_param = (struct swap_axis_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_SwapAxisParam* tm_param = ( TM2_SwapAxisParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_SwapAxisParam* tm_param = (TM2_SwapAxisParam*)(mem_base + tm_op->offset_t_param);
 
     swap_axis_param->dim_0 = tm_param->dim_0;
     swap_axis_param->dim_1 = tm_param->dim_1;
@@ -55,7 +53,6 @@ static int tm2_load_swap_axis(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_swap_axis_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -71,7 +68,6 @@ int register_tm2_swap_axis_op()
     return 0;
 }
 
-
 int unregister_tm2_swap_axis_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_tanh.c b/source/serializer/tmfile/op/tm2_tanh.c
index 5428ef3d6..42e4b8a8f 100644
--- a/source/serializer/tmfile/op/tm2_tanh.c
+++ b/source/serializer/tmfile/op/tm2_tanh.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int tanh_op_map(int op)
 {
     return OP_TANH;
 }
 
-
 static int tm2_load_tanh(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_tanh_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_tanh_op()
     return 0;
 }
 
-
 int unregister_tm2_tanh_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_threshold.c b/source/serializer/tmfile/op/tm2_threshold.c
index 7c42ca455..a24b83050 100644
--- a/source/serializer/tmfile/op/tm2_threshold.c
+++ b/source/serializer/tmfile/op/tm2_threshold.c
@@ -34,26 +34,23 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int threshold_op_map(int op)
 {
     return OP_THRESHOLD;
 }
 
-
 static int tm2_load_threshold(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct threshold_param* param = ( struct threshold_param* )ir_node->op.param_mem;
+    struct threshold_param* param = (struct threshold_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_ThresholdParam* tm_param = ( TM2_ThresholdParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_ThresholdParam* tm_param = (TM2_ThresholdParam*)(mem_base + tm_op->offset_t_param);
 
     param->threshold = tm_param->threshold;
 
     return 0;
 }
 
-
 int register_tm2_threshold_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -69,7 +66,6 @@ int register_tm2_threshold_op()
     return 0;
 }
 
-
 int unregister_tm2_threshold_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_tile.c b/source/serializer/tmfile/op/tm2_tile.c
index 84cc7ca62..a128bc4f8 100644
--- a/source/serializer/tmfile/op/tm2_tile.c
+++ b/source/serializer/tmfile/op/tm2_tile.c
@@ -35,13 +35,11 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int tile_op_map(int op)
 {
     return OP_TILE;
 }
 
-
 static int tm2_load_tile(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
     struct tile_param* tile_param = (struct tile_param*)ir_node->op.param_mem;
@@ -51,21 +49,20 @@ static int tm2_load_tile(struct graph* ir_graph, struct node* ir_node, const TM2
     tile_param->frame_flag = tm_param->frame_flag;
     if (tm_param->offset_reps != TM2_NOT_SET)
     {
-        const TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_reps);
+        const TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)(mem_base + tm_param->offset_reps);
         tile_param->reps_size = v_re_shape->v_num;
 
-        tile_param->reps = ( int* )sys_malloc(v_re_shape->v_num * sizeof(int));
+        tile_param->reps = (int*)sys_malloc(v_re_shape->v_num * sizeof(int));
 
         for (unsigned int i = 0; i < v_re_shape->v_num; i++)
         {
             tile_param->reps[i] = v_re_shape->dims[i];
         }
-    }    
+    }
 
     return 0;
 }
 
-
 int register_tm2_tile_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -81,7 +78,6 @@ int register_tm2_tile_op()
     return 0;
 }
 
-
 int unregister_tm2_tile_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_topkv2.c b/source/serializer/tmfile/op/tm2_topkv2.c
index c2287949f..8ae0dda27 100644
--- a/source/serializer/tmfile/op/tm2_topkv2.c
+++ b/source/serializer/tmfile/op/tm2_topkv2.c
@@ -34,20 +34,18 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int topkv2_op_map(int op)
 {
     return OP_TOPKV2;
 }
 
-
 static int tm2_load_topkv2(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                            const TM2_Operator* tm_op)
 {
-    struct topkv2_param* topkv2_param = ( struct topkv2_param* )ir_node->op.param_mem;
+    struct topkv2_param* topkv2_param = (struct topkv2_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_TopKV2Param* tm_param = ( TM2_TopKV2Param* )(mem_base + tm_op->offset_t_param);
+    const TM2_TopKV2Param* tm_param = (TM2_TopKV2Param*)(mem_base + tm_op->offset_t_param);
 
     topkv2_param->k = tm_param->k;
     if (tm_param->sorted)
@@ -58,7 +56,6 @@ static int tm2_load_topkv2(struct graph* ir_graph, struct node* ir_node, const T
     return 0;
 }
 
-
 int register_tm2_topkv2_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -74,7 +71,6 @@ int register_tm2_topkv2_op()
     return 0;
 }
 
-
 int unregister_tm2_topkv2_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_transpose.c b/source/serializer/tmfile/op/tm2_transpose.c
index c97077317..da5fd4a7b 100644
--- a/source/serializer/tmfile/op/tm2_transpose.c
+++ b/source/serializer/tmfile/op/tm2_transpose.c
@@ -35,16 +35,14 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int transpose_op_map(int op)
 {
     return OP_TRANSPOSE;
 }
 
-
 static int tm2_load_transpose(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct transpose_param* transpose_param = ( struct transpose_param* )ir_node->op.param_mem;
+    struct transpose_param* transpose_param = (struct transpose_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
     const TM2_TransposeParam* tm_param = (TM2_TransposeParam*)(mem_base + tm_op->offset_t_param);
@@ -64,7 +62,6 @@ static int tm2_load_transpose(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_transpose_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -80,7 +77,6 @@ int register_tm2_transpose_op()
     return 0;
 }
 
-
 int unregister_tm2_transpose_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_unary.c b/source/serializer/tmfile/op/tm2_unary.c
index a25e0e121..aced83cf9 100644
--- a/source/serializer/tmfile/op/tm2_unary.c
+++ b/source/serializer/tmfile/op/tm2_unary.c
@@ -34,26 +34,23 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int unary_op_map(int op)
 {
     return OP_UNARY;
 }
 
-
 static int tm2_load_unary(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, const TM2_Operator* tm_op)
 {
-    struct unary_param* unary_param = ( struct unary_param* )ir_node->op.param_mem;
+    struct unary_param* unary_param = (struct unary_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_UnaryParam* tm_param = ( TM2_UnaryParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_UnaryParam* tm_param = (TM2_UnaryParam*)(mem_base + tm_op->offset_t_param);
 
     unary_param->type = tm_param->type;
 
     return 0;
 }
 
-
 int register_tm2_unary_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -69,7 +66,6 @@ int register_tm2_unary_op()
     return 0;
 }
 
-
 int unregister_tm2_unary_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_unsqueeze.c b/source/serializer/tmfile/op/tm2_unsqueeze.c
index 3e3fbb61f..3458bca7c 100644
--- a/source/serializer/tmfile/op/tm2_unsqueeze.c
+++ b/source/serializer/tmfile/op/tm2_unsqueeze.c
@@ -35,26 +35,24 @@
 #include "utility/sys_port.h"
 #include "utility/log.h"
 
-
 static int unsqueeze_op_map(int op)
 {
     return OP_UNSQUEEZE;
 }
 
-
 static int tm2_load_unsqueeze(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                               const TM2_Operator* tm_op)
 {
-    struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )ir_node->op.param_mem;
+    struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_UnsqueezeParam* tm_param = ( TM2_UnsqueezeParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_UnsqueezeParam* tm_param = (TM2_UnsqueezeParam*)(mem_base + tm_op->offset_t_param);
 
     if (tm_param->offset_vi_axises != TM2_NOT_SET)
     {
-        const TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )(mem_base + tm_param->offset_vi_axises);
+        const TM2_Vector_dims* v_axises = (TM2_Vector_dims*)(mem_base + tm_param->offset_vi_axises);
         unsqueeze_param->axises_size = v_axises->v_num;
-        unsqueeze_param->axises = ( int* )sys_malloc(v_axises->v_num * sizeof(int));
+        unsqueeze_param->axises = (int*)sys_malloc(v_axises->v_num * sizeof(int));
         for (unsigned int i = 0; i < v_axises->v_num; i++)
             unsqueeze_param->axises[i] = v_axises->dims[i];
     }
@@ -62,7 +60,6 @@ static int tm2_load_unsqueeze(struct graph* ir_graph, struct node* ir_node, cons
     return 0;
 }
 
-
 int register_tm2_unsqueeze_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -78,7 +75,6 @@ int register_tm2_unsqueeze_op()
     return 0;
 }
 
-
 int unregister_tm2_unsqueeze_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_upsample.c b/source/serializer/tmfile/op/tm2_upsample.c
index ef1418ce3..1edb7ec4a 100644
--- a/source/serializer/tmfile/op/tm2_upsample.c
+++ b/source/serializer/tmfile/op/tm2_upsample.c
@@ -34,27 +34,24 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int upsample_op_map(int op)
 {
     return OP_UPSAMPLE;
 }
 
-
 static int tm2_load_upsample(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                              const TM2_Operator* tm_op)
 {
-    struct upsample_param* upsample_param = ( struct upsample_param* )ir_node->op.param_mem;
+    struct upsample_param* upsample_param = (struct upsample_param*)ir_node->op.param_mem;
     const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy;
     const char* mem_base = tm2_priv->base;
-    const TM2_UpsampleParam* tm_param = ( TM2_UpsampleParam* )(mem_base + tm_op->offset_t_param);
+    const TM2_UpsampleParam* tm_param = (TM2_UpsampleParam*)(mem_base + tm_op->offset_t_param);
 
     upsample_param->scale = tm_param->scale;
 
     return 0;
 }
 
-
 int register_tm2_upsample_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -70,7 +67,6 @@ int register_tm2_upsample_op()
     return 0;
 }
 
-
 int unregister_tm2_upsample_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_where.c b/source/serializer/tmfile/op/tm2_where.c
index 866895086..80d34e049 100644
--- a/source/serializer/tmfile/op/tm2_where.c
+++ b/source/serializer/tmfile/op/tm2_where.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int where_op_map(int op)
 {
     return OP_RELU1;
 }
 
-
 static int tm2_load_where(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                           const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_where_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_where_op()
     return 0;
 }
 
-
 int unregister_tm2_where_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/op/tm2_zeroslike.c b/source/serializer/tmfile/op/tm2_zeroslike.c
index e1e735644..830ce2707 100644
--- a/source/serializer/tmfile/op/tm2_zeroslike.c
+++ b/source/serializer/tmfile/op/tm2_zeroslike.c
@@ -32,20 +32,17 @@
 #include "device/device.h"
 #include "utility/log.h"
 
-
 static int zeroslike_op_map(int op)
 {
     return OP_ZEROSLIKE;
 }
 
-
 static int tm2_load_zeroslike(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node,
                               const TM2_Operator* tm_op)
 {
     return 0;
 }
 
-
 int register_tm2_zeroslike_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
@@ -61,7 +58,6 @@ int register_tm2_zeroslike_op()
     return 0;
 }
 
-
 int unregister_tm2_zeroslike_op()
 {
     struct serializer* tm2_s = find_serializer_via_name("tengine");
diff --git a/source/serializer/tmfile/tm2_format.h b/source/serializer/tmfile/tm2_format.h
index f303e4dee..670715f95 100644
--- a/source/serializer/tmfile/tm2_format.h
+++ b/source/serializer/tmfile/tm2_format.h
@@ -32,271 +32,271 @@
 extern "C" {
 #endif
 
-#define TM2_FILE_VER_MAIN       2
-#define TM2_FILE_VER_SUB        0
-#define TM2_FILE_VER_COMPILE    0
+#define TM2_FILE_VER_MAIN    2
+#define TM2_FILE_VER_SUB     0
+#define TM2_FILE_VER_COMPILE 0
 
-#define TM2_OP_VER              1
+#define TM2_OP_VER 1
 
-#define TM2_NOT_SET             0x00
+#define TM2_NOT_SET 0x00
 
 /* Type define */
-typedef uint32_t tm_uoffset_t;                  /* offset is 4-byte unsigned integer */
-typedef uint32_t tm_size_t;                     /* size is 4-byte unsigned integer */
-typedef uint8_t tm_bool_t;                      /* bool is 1-byte unsigned integer */
+typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */
+typedef uint32_t tm_size_t;    /* size is 4-byte unsigned integer */
+typedef uint8_t tm_bool_t;     /* bool is 1-byte unsigned integer */
 
 /* Operator strings */
-#define TM2_OPSTR_ACCURACY                      "Accuracy"
-#define TM2_OPSTR_BATCHNORMALIZATION            "BatchNormalization"
-#define TM2_OPSTR_BILINEARRESIZE                "Resize"
-#define TM2_OPSTR_CONCAT                        "Concat"
-#define TM2_OPSTR_CONST                         "Const"
-#define TM2_OPSTR_CONVOLUTION                   "Convolution"
-#define TM2_OPSTR_DECONVOLUTION                 "Deconvolution"
-#define TM2_OPSTR_DETECTIONOUTPUT               "DetectionOutput"
-#define TM2_OPSTR_DROPOUT                       "Dropout"
-#define TM2_OPSTR_ELTWISE                       "Eltwise"
-#define TM2_OPSTR_FLATTEN                       "Flatten"
-#define TM2_OPSTR_FULLYCONNECTED                "FullyConnected"
-#define TM2_OPSTR_INPUTOP                       "InputOp"
-#define TM2_OPSTR_LRN                           "LRN"
-#define TM2_OPSTR_NORMALIZE                     "Normalize"
-#define TM2_OPSTR_PERMUTE                       "Permute"
-#define TM2_OPSTR_POOLING                       "Pooling"
-#define TM2_OPSTR_PRELU                         "PReLU"
-#define TM2_OPSTR_PRIORBOX                      "PriorBox"
-#define TM2_OPSTR_REGION                        "Region"
-#define TM2_OPSTR_RELU                          "ReLu"
-#define TM2_OPSTR_RELU6                         "ReLu6"
-#define TM2_OPSTR_REORG                         "Reorg"
-#define TM2_OPSTR_RESHAPE                       "Reshape"
-#define TM2_OPSTR_ROIPOOLING                    "ROIPooling"
-#define TM2_OPSTR_RPN                           "RPN"
-#define TM2_OPSTR_SCALE                         "Scale"
-#define TM2_OPSTR_SLICE                         "Slice"
-#define TM2_OPSTR_SOFTMAX                       "Softmax"
-#define TM2_OPSTR_SPLIT                         "Split"
-#define TM2_OPSTR_DETECTIONPOSTPROCESS          "DetectionPostProcess"
-#define TM2_OPSTR_GEMM                          "Gemm"
-#define TM2_OPSTR_GENERIC                       "Generic"
-#define TM2_OPSTR_LOGISTIC                      "Logistic"
-#define TM2_OPSTR_LSTM                          "LSTM"
-#define TM2_OPSTR_RNN                           "RNN"
-#define TM2_OPSTR_TANH                          "Tanh"
-#define TM2_OPSTR_SIGMOID                       "Sigmoid"
-#define TM2_OPSTR_SQUEEZE                       "Squeeze"
-#define TM2_OPSTR_PAD                           "Pad"
-#define TM2_OPSTR_STRIDEDSLICE                  "StridedSlice"
-#define TM2_OPSTR_REDUCTION                     "Reduction"
-#define TM2_OPSTR_ARGMAX                        "ArgMax"
-#define TM2_OPSTR_ARGMIN                        "ArgMin"
-#define TM2_OPSTR_TOPKV2                        "TopKV2"
-#define TM2_OPSTR_MAX                           "Maximum"
-#define TM2_OPSTR_MIN                           "Minimum"
-#define TM2_OPSTR_ADDN                          "Addn"
-#define TM2_OPSTR_SWAPAXIS                      "SwapAxis"
-#define TM2_OPSTR_GRU                           "GRU"
-#define TM2_OPSTR_FUSEDBNSCALERELU              "Fused.BNScaleReLu"
-#define TM2_OPSTR_UPSAMPLE                      "Upsample"
-#define TM2_OPSTR_SHUFFLECHANNEL                "ShuffleChannel"
-#define TM2_OPSTR_RESIZE                        "Resize"
-#define TM2_OPSTR_SPACETOBATCHND                "SpaceToBatchND"
-#define TM2_OPSTR_BATCHTOSPACEND                "BatchToSpaceND"
-#define TM2_OPSTR_CROP                          "Crop"
-#define TM2_OPSTR_PSROIPOOLING                  "Psroipooling"
-#define TM2_OPSTR_ROIALIGN                      "Roialign"
-#define TM2_OPSTR_EXPANDDIMS                    "Expanddims"
-#define TM2_OPSTR_UNARY                         "Unary"
-#define TM2_OPSTR_BIAS                          "Bias"
-#define TM2_OPSTR_NOOP                          "Noop"
-#define TM2_OPSTR_THRESHOLD                     "Threshold"
-#define TM2_OPSTR_HARDSIGMOID                   "Hardsigmoid"
-#define TM2_OPSTR_EMBED                         "Embedding"
-#define TM2_OPSTR_INSTANCENORM                  "InstanceNorm"
-#define TM2_OPSTR_MVN                           "MVN"
-#define TM2_OPSTR_ABSVAL                        "Absval"
-#define TM2_OPSTR_CAST                          "Cast"
-#define TM2_OPSTR_HARDSWISH                     "HardSwish"
-#define TM2_OPSTR_INTERP                        "Interp"
-#define TM2_OPSTR_SELU                          "Selu"
-#define TM2_OPSTR_ELU                           "Elu"
-#define TM2_OPSTR_BROADMUL                      "BroadMul"
-#define TM2_OPSTR_LOGICAL                       "Logical"
-#define TM2_OPSTR_GATHER                        "Gather"
-#define TM2_OPSTR_TRANSPOSE                     "Transpose"
-#define TM2_OPSTR_REVERSE                       "Reverse"
-#define TM2_OPSTR_COMPARISON                    "Comparison"
-#define TM2_OPSTR_SPACETODEPTH                  "SpaceToDepth"
-#define TM2_OPSTR_DEPTHTOSPACE                  "DepthToSpace"
-#define TM2_OPSTR_SQUAREDDIFFERENCE             "SquaredDifference"
-#define TM2_OPSTR_SPARSETODENSE                 "SparseToDense"
-#define TM2_OPSTR_CEIL                          "Ceil"
-#define TM2_OPSTR_ROUND                         "Round"
-#define TM2_OPSTR_ZEROSLIKE                     "ZerosLike"
-#define TM2_OPSTR_CLIP                          "Clip"
-#define TM2_OPSTR_UNSQUEEZE                     "Unsqueeze"
-#define TM2_OPSTR_REDUCEL2                      "ReduceL2"
-#define TM2_OPSTR_MEAN                          "Mean"
-#define TM2_OPSTR_MATMUL                        "MatMul"
-#define TM2_OPSTR_MISH                          "Mish"
-#define TM2_OPSTR_L2NORMALIZATION               "L2Normalization"
-#define TM2_OPSTR_RELU1                         "ReLU1"
-#define TM2_OPSTR_SHAPE                         "Shape"
-#define TM2_OPSTR_LOGSOFTMAX                    "LogSoftmax"
-#define TM2_OPSTR_SCATTER                       "Scatter"
-#define TM2_OPSTR_TILE                          "Tile"
-#define TM2_OPSTR_L2POOL                        "L2Pool"
-#define TM2_OPSTR_SOFTPLUS 						"Softplus"
-#define TM2_OPSTR_RECIPROCAL 					"Reciprocal"
-#define TM2_OPSTR_SPATIALTRANSFORMER            "SpatialTransformer"
-#define TM2_OPSTR_EXPAND                        "Expand"
+#define TM2_OPSTR_ACCURACY             "Accuracy"
+#define TM2_OPSTR_BATCHNORMALIZATION   "BatchNormalization"
+#define TM2_OPSTR_BILINEARRESIZE       "Resize"
+#define TM2_OPSTR_CONCAT               "Concat"
+#define TM2_OPSTR_CONST                "Const"
+#define TM2_OPSTR_CONVOLUTION          "Convolution"
+#define TM2_OPSTR_DECONVOLUTION        "Deconvolution"
+#define TM2_OPSTR_DETECTIONOUTPUT      "DetectionOutput"
+#define TM2_OPSTR_DROPOUT              "Dropout"
+#define TM2_OPSTR_ELTWISE              "Eltwise"
+#define TM2_OPSTR_FLATTEN              "Flatten"
+#define TM2_OPSTR_FULLYCONNECTED       "FullyConnected"
+#define TM2_OPSTR_INPUTOP              "InputOp"
+#define TM2_OPSTR_LRN                  "LRN"
+#define TM2_OPSTR_NORMALIZE            "Normalize"
+#define TM2_OPSTR_PERMUTE              "Permute"
+#define TM2_OPSTR_POOLING              "Pooling"
+#define TM2_OPSTR_PRELU                "PReLU"
+#define TM2_OPSTR_PRIORBOX             "PriorBox"
+#define TM2_OPSTR_REGION               "Region"
+#define TM2_OPSTR_RELU                 "ReLu"
+#define TM2_OPSTR_RELU6                "ReLu6"
+#define TM2_OPSTR_REORG                "Reorg"
+#define TM2_OPSTR_RESHAPE              "Reshape"
+#define TM2_OPSTR_ROIPOOLING           "ROIPooling"
+#define TM2_OPSTR_RPN                  "RPN"
+#define TM2_OPSTR_SCALE                "Scale"
+#define TM2_OPSTR_SLICE                "Slice"
+#define TM2_OPSTR_SOFTMAX              "Softmax"
+#define TM2_OPSTR_SPLIT                "Split"
+#define TM2_OPSTR_DETECTIONPOSTPROCESS "DetectionPostProcess"
+#define TM2_OPSTR_GEMM                 "Gemm"
+#define TM2_OPSTR_GENERIC              "Generic"
+#define TM2_OPSTR_LOGISTIC             "Logistic"
+#define TM2_OPSTR_LSTM                 "LSTM"
+#define TM2_OPSTR_RNN                  "RNN"
+#define TM2_OPSTR_TANH                 "Tanh"
+#define TM2_OPSTR_SIGMOID              "Sigmoid"
+#define TM2_OPSTR_SQUEEZE              "Squeeze"
+#define TM2_OPSTR_PAD                  "Pad"
+#define TM2_OPSTR_STRIDEDSLICE         "StridedSlice"
+#define TM2_OPSTR_REDUCTION            "Reduction"
+#define TM2_OPSTR_ARGMAX               "ArgMax"
+#define TM2_OPSTR_ARGMIN               "ArgMin"
+#define TM2_OPSTR_TOPKV2               "TopKV2"
+#define TM2_OPSTR_MAX                  "Maximum"
+#define TM2_OPSTR_MIN                  "Minimum"
+#define TM2_OPSTR_ADDN                 "Addn"
+#define TM2_OPSTR_SWAPAXIS             "SwapAxis"
+#define TM2_OPSTR_GRU                  "GRU"
+#define TM2_OPSTR_FUSEDBNSCALERELU     "Fused.BNScaleReLu"
+#define TM2_OPSTR_UPSAMPLE             "Upsample"
+#define TM2_OPSTR_SHUFFLECHANNEL       "ShuffleChannel"
+#define TM2_OPSTR_RESIZE               "Resize"
+#define TM2_OPSTR_SPACETOBATCHND       "SpaceToBatchND"
+#define TM2_OPSTR_BATCHTOSPACEND       "BatchToSpaceND"
+#define TM2_OPSTR_CROP                 "Crop"
+#define TM2_OPSTR_PSROIPOOLING         "Psroipooling"
+#define TM2_OPSTR_ROIALIGN             "Roialign"
+#define TM2_OPSTR_EXPANDDIMS           "Expanddims"
+#define TM2_OPSTR_UNARY                "Unary"
+#define TM2_OPSTR_BIAS                 "Bias"
+#define TM2_OPSTR_NOOP                 "Noop"
+#define TM2_OPSTR_THRESHOLD            "Threshold"
+#define TM2_OPSTR_HARDSIGMOID          "Hardsigmoid"
+#define TM2_OPSTR_EMBED                "Embedding"
+#define TM2_OPSTR_INSTANCENORM         "InstanceNorm"
+#define TM2_OPSTR_MVN                  "MVN"
+#define TM2_OPSTR_ABSVAL               "Absval"
+#define TM2_OPSTR_CAST                 "Cast"
+#define TM2_OPSTR_HARDSWISH            "HardSwish"
+#define TM2_OPSTR_INTERP               "Interp"
+#define TM2_OPSTR_SELU                 "Selu"
+#define TM2_OPSTR_ELU                  "Elu"
+#define TM2_OPSTR_BROADMUL             "BroadMul"
+#define TM2_OPSTR_LOGICAL              "Logical"
+#define TM2_OPSTR_GATHER               "Gather"
+#define TM2_OPSTR_TRANSPOSE            "Transpose"
+#define TM2_OPSTR_REVERSE              "Reverse"
+#define TM2_OPSTR_COMPARISON           "Comparison"
+#define TM2_OPSTR_SPACETODEPTH         "SpaceToDepth"
+#define TM2_OPSTR_DEPTHTOSPACE         "DepthToSpace"
+#define TM2_OPSTR_SQUAREDDIFFERENCE    "SquaredDifference"
+#define TM2_OPSTR_SPARSETODENSE        "SparseToDense"
+#define TM2_OPSTR_CEIL                 "Ceil"
+#define TM2_OPSTR_ROUND                "Round"
+#define TM2_OPSTR_ZEROSLIKE            "ZerosLike"
+#define TM2_OPSTR_CLIP                 "Clip"
+#define TM2_OPSTR_UNSQUEEZE            "Unsqueeze"
+#define TM2_OPSTR_REDUCEL2             "ReduceL2"
+#define TM2_OPSTR_MEAN                 "Mean"
+#define TM2_OPSTR_MATMUL               "MatMul"
+#define TM2_OPSTR_MISH                 "Mish"
+#define TM2_OPSTR_L2NORMALIZATION      "L2Normalization"
+#define TM2_OPSTR_RELU1                "ReLU1"
+#define TM2_OPSTR_SHAPE                "Shape"
+#define TM2_OPSTR_LOGSOFTMAX           "LogSoftmax"
+#define TM2_OPSTR_SCATTER              "Scatter"
+#define TM2_OPSTR_TILE                 "Tile"
+#define TM2_OPSTR_L2POOL               "L2Pool"
+#define TM2_OPSTR_SOFTPLUS             "Softplus"
+#define TM2_OPSTR_RECIPROCAL           "Reciprocal"
+#define TM2_OPSTR_SPATIALTRANSFORMER   "SpatialTransformer"
+#define TM2_OPSTR_EXPAND               "Expand"
 /* Operator types */
-#define TM2_OPTYPE_ACCURACY                       0 /* No Param                 */
-#define TM2_OPTYPE_BATCHNORMALIZATION             1 /* TM2_BatchNormParam       */
-#define TM2_OPTYPE_BILINEARRESIZE                 2 /* TM2_ResizeParam          */
-#define TM2_OPTYPE_CONCAT                         3 /* TM2_ConcatParam          */
-#define TM2_OPTYPE_CONST                          4 /* No Param                 */
-#define TM2_OPTYPE_CONVOLUTION                    5 /* TM2_ConvParam            */
-#define TM2_OPTYPE_DECONVOLUTION                  6 /* TM2_DeconvParam          */
-#define TM2_OPTYPE_DETECTIONOUTPUT                7 /* TM2_DetectionOutputParam */
-#define TM2_OPTYPE_DROPOUT                        8 /* No Param                 */
-#define TM2_OPTYPE_ELTWISE                        9 /* TM2_EltwiseParam         */
-#define TM2_OPTYPE_FLATTEN                       10 /* TM2_FlattenParam         */
-#define TM2_OPTYPE_FULLYCONNECTED                11 /* TM2_FCParam              */
-#define TM2_OPTYPE_INPUTOP                       12 /* No Param                 */
-#define TM2_OPTYPE_LRN                           13 /* TM2_LRNParam             */
-#define TM2_OPTYPE_NORMALIZE                     14 /* TM2_NormalizeParam       */
-#define TM2_OPTYPE_PERMUTE                       15 /* TM2_PermuteParam         */
-#define TM2_OPTYPE_POOLING                       16 /* TM2_PoolParam            */
-#define TM2_OPTYPE_PRELU                         17 /* No Param                 */
-#define TM2_OPTYPE_PRIORBOX                      18 /* TM2_PriorBoxParam        */
-#define TM2_OPTYPE_REGION                        19 /* TM2_RegionParam          */
-#define TM2_OPTYPE_RELU                          20 /* TM2_ReLuParam            */
-#define TM2_OPTYPE_RELU6                         21 /* No Param                 */
-#define TM2_OPTYPE_REORG                         22 /* TM2_ReorgParam           */
-#define TM2_OPTYPE_RESHAPE                       23 /* TM2_ReshapeParam         */
-#define TM2_OPTYPE_ROIPOOLING                    24 /* TM2_ROIPoolingParam      */
-#define TM2_OPTYPE_RPN                           25 /* TM2_RPNParam             */
-#define TM2_OPTYPE_SCALE                         26 /* TM2_ScaleParam           */
-#define TM2_OPTYPE_SLICE                         27 /* TM2_SliceParam           */
-#define TM2_OPTYPE_SOFTMAX                       28 /* TM2_SoftmaxParam         */
-#define TM2_OPTYPE_SPLIT                         29 /* No Param                 */
-#define TM2_OPTYPE_DETECTIONPOSTPROCESS          30 /* TM2_DetectionPostProcessParam */
-#define TM2_OPTYPE_GEMM                          31 /* TM2_GemmParam            */
-#define TM2_OPTYPE_GENERIC                       32 /* TM2_GenericParam         */
-#define TM2_OPTYPE_LOGISTIC                      33 /* No Param                 */
-#define TM2_OPTYPE_LSTM                          34 /* TM2_LstmParam            */
-#define TM2_OPTYPE_RNN                           35 /* TM2_RnnParam             */
-#define TM2_OPTYPE_TANH                          36 /* No Param                 */
-#define TM2_OPTYPE_SIGMOID                       37 /* No Param                 */
-#define TM2_OPTYPE_SQUEEZE                       38 /* TM2_SqueezeParam         */
-#define TM2_OPTYPE_FUSEDBNSCALERELU              39 /* No Param                 */
-#define TM2_OPTYPE_PAD                           40 /* TM2_PadParam             */
-#define TM2_OPTYPE_STRIDEDSLICE                  41 /* TM2_StrideSliceParam     */
-#define TM2_OPTYPE_ARGMAX                        42 /* TM2_ArgmaxParam          */
-#define TM2_OPTYPE_ARGMIN                        43 /* TM2_ArgminParam          */
-#define TM2_OPTYPE_TOPKV2                        44 /* TM2_TopkV2Param          */
-#define TM2_OPTYPE_REDUCTION                     45 /* TM2_ReductionParam       */
-#define TM2_OPTYPE_MAX                           46 /* No Param                 */
-#define TM2_OPTYPE_MIN                           47 /* No Param                 */
-#define TM2_OPTYPE_GRU                           48 /* TM2_GruParam             */
-#define TM2_OPTYPE_ADDN                          49 /* TM2_AddNParam            */
-#define TM2_OPTYPE_SWAPAXIS                      50 /* TM2_SwapAixsParam        */
-#define TM2_OPTYPE_UPSAMPLE                      51 /* TM2_UpsampleParam        */
-#define TM2_OPTYPE_SPACETOBATCHND                52
-#define TM2_OPTYPE_BATCHTOSPACEND                53
-#define TM2_OPTYPE_RESIZE                        54
-#define TM2_OPTYPE_SHUFFLECHANNEL                55 /* TM2_ShuffleChannelPara   */
-#define TM2_OPTYPE_CROP                          56 /* TM2_CropParam            */
-#define TM2_OPTYPE_ROIALIGN                      57
-#define TM2_OPTYPE_PSROIPOOLING                  58
-#define TM2_OPTYPE_UNARY                         59
-#define TM2_OPTYPE_EXPANDDIMS                    60
-#define TM2_OPTYPE_BIAS                          61
-#define TM2_OPTYPE_NOOP                          62
-#define TM2_OPTYPE_THRESHOLD                     63
-#define TM2_OPTYPE_HARDSIGMOID                   64
-#define TM2_OPTYPE_EMBED                         65
-#define TM2_OPTYPE_INSTANCENORM                  66
-#define TM2_OPTYPE_MVN                           67
-#define TM2_OPTYPE_ABSVAL                        68
-#define TM2_OPTYPE_CAST                          69
-#define TM2_OPTYPE_HARDSWISH                     70
-#define TM2_OPTYPE_INTERP                        71
-#define TM2_OPTYPE_SELU                          72
-#define TM2_OPTYPE_ELU                           73
-#define TM2_OPTYPE_BROADMUL                      74
-#define TM2_OPTYPE_LOGICAL                       75
-#define TM2_OPTYPE_GATHER                        76
-#define TM2_OPTYPE_TRANSPOSE                     77
-#define TM2_OPTYPE_COMPARISON                    78
-#define TM2_OPTYPE_SPACETODEPTH                  79
-#define TM2_OPTYPE_DEPTHTOSPACE                  80
-#define TM2_OPTYPE_REVERSE                       81
-#define TM2_OPTYPE_SPARSETODENSE                 82
-#define TM2_OPTYPE_CEIL                          83
-#define TM2_OPTYPE_SQUAREDDIFFERENCE             84
-#define TM2_OPTYPE_ROUND                         85
-#define TM2_OPTYPE_ZEROSLIKE                     86
-#define TM2_OPTYPE_CLIP                          87
-#define TM2_OPTYPE_UNSQUEEZE                     88
-#define TM2_OPTYPE_REDUCEL2                      89
-#define TM2_OPTYPE_MEAN                          90
-#define TM2_OPTYPE_MATMUL                        91
-#define TM2_OPTYPE_EXPAND                        92
-#define TM2_OPTYPE_SCATTER                       93
-#define TM2_OPTYPE_SHAPE                         94
-#define TM2_OPTYPE_WHERE                         95
-#define TM2_OPTYPE_TILE                          96
-#define TM2_OPTYPE_MISH                          97
-#define TM2_OPTYPE_L2POOL                        98
-#define TM2_OPTYPE_LOGSOFTMAX                    99
-#define TM2_OPTYPE_RELU1                        100
-#define TM2_OPTYPE_L2NORMALIZATION              101
-#define TM2_OPTYPE_SOFTPLUS                     102
-#define TM2_OPTYPE_RECIPROCAL                   103
-#define TM2_OPTYPE_SPATIALTRANSFORMER           105
-#define TM2_OPTYPE_NUM                          106
+#define TM2_OPTYPE_ACCURACY             0  /* No Param                 */
+#define TM2_OPTYPE_BATCHNORMALIZATION   1  /* TM2_BatchNormParam       */
+#define TM2_OPTYPE_BILINEARRESIZE       2  /* TM2_ResizeParam          */
+#define TM2_OPTYPE_CONCAT               3  /* TM2_ConcatParam          */
+#define TM2_OPTYPE_CONST                4  /* No Param                 */
+#define TM2_OPTYPE_CONVOLUTION          5  /* TM2_ConvParam            */
+#define TM2_OPTYPE_DECONVOLUTION        6  /* TM2_DeconvParam          */
+#define TM2_OPTYPE_DETECTIONOUTPUT      7  /* TM2_DetectionOutputParam */
+#define TM2_OPTYPE_DROPOUT              8  /* No Param                 */
+#define TM2_OPTYPE_ELTWISE              9  /* TM2_EltwiseParam         */
+#define TM2_OPTYPE_FLATTEN              10 /* TM2_FlattenParam         */
+#define TM2_OPTYPE_FULLYCONNECTED       11 /* TM2_FCParam              */
+#define TM2_OPTYPE_INPUTOP              12 /* No Param                 */
+#define TM2_OPTYPE_LRN                  13 /* TM2_LRNParam             */
+#define TM2_OPTYPE_NORMALIZE            14 /* TM2_NormalizeParam       */
+#define TM2_OPTYPE_PERMUTE              15 /* TM2_PermuteParam         */
+#define TM2_OPTYPE_POOLING              16 /* TM2_PoolParam            */
+#define TM2_OPTYPE_PRELU                17 /* No Param                 */
+#define TM2_OPTYPE_PRIORBOX             18 /* TM2_PriorBoxParam        */
+#define TM2_OPTYPE_REGION               19 /* TM2_RegionParam          */
+#define TM2_OPTYPE_RELU                 20 /* TM2_ReLuParam            */
+#define TM2_OPTYPE_RELU6                21 /* No Param                 */
+#define TM2_OPTYPE_REORG                22 /* TM2_ReorgParam           */
+#define TM2_OPTYPE_RESHAPE              23 /* TM2_ReshapeParam         */
+#define TM2_OPTYPE_ROIPOOLING           24 /* TM2_ROIPoolingParam      */
+#define TM2_OPTYPE_RPN                  25 /* TM2_RPNParam             */
+#define TM2_OPTYPE_SCALE                26 /* TM2_ScaleParam           */
+#define TM2_OPTYPE_SLICE                27 /* TM2_SliceParam           */
+#define TM2_OPTYPE_SOFTMAX              28 /* TM2_SoftmaxParam         */
+#define TM2_OPTYPE_SPLIT                29 /* No Param                 */
+#define TM2_OPTYPE_DETECTIONPOSTPROCESS 30 /* TM2_DetectionPostProcessParam */
+#define TM2_OPTYPE_GEMM                 31 /* TM2_GemmParam            */
+#define TM2_OPTYPE_GENERIC              32 /* TM2_GenericParam         */
+#define TM2_OPTYPE_LOGISTIC             33 /* No Param                 */
+#define TM2_OPTYPE_LSTM                 34 /* TM2_LstmParam            */
+#define TM2_OPTYPE_RNN                  35 /* TM2_RnnParam             */
+#define TM2_OPTYPE_TANH                 36 /* No Param                 */
+#define TM2_OPTYPE_SIGMOID              37 /* No Param                 */
+#define TM2_OPTYPE_SQUEEZE              38 /* TM2_SqueezeParam         */
+#define TM2_OPTYPE_FUSEDBNSCALERELU     39 /* No Param                 */
+#define TM2_OPTYPE_PAD                  40 /* TM2_PadParam             */
+#define TM2_OPTYPE_STRIDEDSLICE         41 /* TM2_StrideSliceParam     */
+#define TM2_OPTYPE_ARGMAX               42 /* TM2_ArgmaxParam          */
+#define TM2_OPTYPE_ARGMIN               43 /* TM2_ArgminParam          */
+#define TM2_OPTYPE_TOPKV2               44 /* TM2_TopkV2Param          */
+#define TM2_OPTYPE_REDUCTION            45 /* TM2_ReductionParam       */
+#define TM2_OPTYPE_MAX                  46 /* No Param                 */
+#define TM2_OPTYPE_MIN                  47 /* No Param                 */
+#define TM2_OPTYPE_GRU                  48 /* TM2_GruParam             */
+#define TM2_OPTYPE_ADDN                 49 /* TM2_AddNParam            */
+#define TM2_OPTYPE_SWAPAXIS             50 /* TM2_SwapAixsParam        */
+#define TM2_OPTYPE_UPSAMPLE             51 /* TM2_UpsampleParam        */
+#define TM2_OPTYPE_SPACETOBATCHND       52
+#define TM2_OPTYPE_BATCHTOSPACEND       53
+#define TM2_OPTYPE_RESIZE               54
+#define TM2_OPTYPE_SHUFFLECHANNEL       55 /* TM2_ShuffleChannelPara   */
+#define TM2_OPTYPE_CROP                 56 /* TM2_CropParam            */
+#define TM2_OPTYPE_ROIALIGN             57
+#define TM2_OPTYPE_PSROIPOOLING         58
+#define TM2_OPTYPE_UNARY                59
+#define TM2_OPTYPE_EXPANDDIMS           60
+#define TM2_OPTYPE_BIAS                 61
+#define TM2_OPTYPE_NOOP                 62
+#define TM2_OPTYPE_THRESHOLD            63
+#define TM2_OPTYPE_HARDSIGMOID          64
+#define TM2_OPTYPE_EMBED                65
+#define TM2_OPTYPE_INSTANCENORM         66
+#define TM2_OPTYPE_MVN                  67
+#define TM2_OPTYPE_ABSVAL               68
+#define TM2_OPTYPE_CAST                 69
+#define TM2_OPTYPE_HARDSWISH            70
+#define TM2_OPTYPE_INTERP               71
+#define TM2_OPTYPE_SELU                 72
+#define TM2_OPTYPE_ELU                  73
+#define TM2_OPTYPE_BROADMUL             74
+#define TM2_OPTYPE_LOGICAL              75
+#define TM2_OPTYPE_GATHER               76
+#define TM2_OPTYPE_TRANSPOSE            77
+#define TM2_OPTYPE_COMPARISON           78
+#define TM2_OPTYPE_SPACETODEPTH         79
+#define TM2_OPTYPE_DEPTHTOSPACE         80
+#define TM2_OPTYPE_REVERSE              81
+#define TM2_OPTYPE_SPARSETODENSE        82
+#define TM2_OPTYPE_CEIL                 83
+#define TM2_OPTYPE_SQUAREDDIFFERENCE    84
+#define TM2_OPTYPE_ROUND                85
+#define TM2_OPTYPE_ZEROSLIKE            86
+#define TM2_OPTYPE_CLIP                 87
+#define TM2_OPTYPE_UNSQUEEZE            88
+#define TM2_OPTYPE_REDUCEL2             89
+#define TM2_OPTYPE_MEAN                 90
+#define TM2_OPTYPE_MATMUL               91
+#define TM2_OPTYPE_EXPAND               92
+#define TM2_OPTYPE_SCATTER              93
+#define TM2_OPTYPE_SHAPE                94
+#define TM2_OPTYPE_WHERE                95
+#define TM2_OPTYPE_TILE                 96
+#define TM2_OPTYPE_MISH                 97
+#define TM2_OPTYPE_L2POOL               98
+#define TM2_OPTYPE_LOGSOFTMAX           99
+#define TM2_OPTYPE_RELU1                100
+#define TM2_OPTYPE_L2NORMALIZATION      101
+#define TM2_OPTYPE_SOFTPLUS             102
+#define TM2_OPTYPE_RECIPROCAL           103
+#define TM2_OPTYPE_SPATIALTRANSFORMER   105
+#define TM2_OPTYPE_NUM                  106
 
 /* --------------------- -------- TM objects -------------------------------- */
 
 typedef struct
 {
-    uint16_t ver_main; /* main version of Tengine model file format */
-    uint16_t ver_sub; /* sub version of Tengine model file format */
-    uint16_t ver_compile; /* compile version of Tengine model file format */
+    uint16_t ver_main;        /* main version of Tengine model file format */
+    uint16_t ver_sub;         /* sub version of Tengine model file format */
+    uint16_t ver_compile;     /* compile version of Tengine model file format */
     tm_uoffset_t offset_root; /* offset of root table (TM2_Model) */
 } TM2_Header;
 
 /* Root table of Tengine model */
 typedef struct
 {
-    int32_t orig_format; /* format of original model */
-    int32_t sub_format; /* sub format for DLA model */
+    int32_t orig_format;              /* format of original model */
+    int32_t sub_format;               /* sub format for DLA model */
     tm_uoffset_t offset_vo_subgraphs; /* offset of TM2_Vector_offsets <offsets of subgraphs> */
-    tm_uoffset_t offset_s_mname; /* offset of string <model name> */
+    tm_uoffset_t offset_s_mname;      /* offset of string <model name> */
 } TM2_Model;
 
 /* Only 1 subgraph is supported currently */
 typedef struct
 {
-    uint32_t subgraph_id; /* subgraph id */
-    int32_t graph_layout; /* actual data layout */
-    int32_t model_layout; /* data layout of original model */
-    tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices <indices of input nodes> */
+    uint32_t subgraph_id;                  /* subgraph id */
+    int32_t graph_layout;                  /* actual data layout */
+    int32_t model_layout;                  /* data layout of original model */
+    tm_uoffset_t offset_vi_input_indices;  /* offset of TM2_Vector_indices <indices of input nodes> */
     tm_uoffset_t offset_vi_output_indices; /* offset of TM2_Vector_indices <indices of output nodes> */
-    tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets <nodes> */
-    tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets <tensors> */
-    tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets <buffers> */
-    tm_uoffset_t offset_s_sname; /* offset of string <subgraph name> */
-    tm_uoffset_t offset_vo_sub_info; /* offset of TM2_Vector_offsets <sub graph infomation> */
+    tm_uoffset_t offset_vo_seq_nodes;      /* offset of TM2_Vector_offsets <nodes> */
+    tm_uoffset_t offset_vo_tensors;        /* offset of TM2_Vector_offsets <tensors> */
+    tm_uoffset_t offset_vo_buffers;        /* offset of TM2_Vector_offsets <buffers> */
+    tm_uoffset_t offset_s_sname;           /* offset of string <subgraph name> */
+    tm_uoffset_t offset_vo_sub_info;       /* offset of TM2_Vector_offsets <sub graph infomation> */
 } TM2_Subgraph;
 
 typedef struct
 {
-    uint32_t subgraph_id; /* sub graph idx */
-    uint32_t input_wait_count; /* input wait count */
-    int32_t data_type;         /* FP32 FP16 U8 INT8 */
+    uint32_t subgraph_id;                 /* sub graph idx */
+    uint32_t input_wait_count;            /* input wait count */
+    int32_t data_type;                    /* FP32 FP16 U8 INT8 */
     tm_uoffset_t offset_vi_node_list;     /* offset of TM2_Vector_indices <indices of node list> */
     tm_uoffset_t offset_vi_input_tensor;  /* offset of TM2_Vector_indices <indices of input node> */
     tm_uoffset_t offset_vi_output_tensor; /* offset of TM2_Vector_indices <indices of output node> */
@@ -306,25 +306,25 @@ typedef struct
 typedef struct
 {
     tm_uoffset_t offset_s_attrname; /* offset of string <attr name> */
-    tm_uoffset_t offset_s_attrval; /* offset of string <attr value> */
+    tm_uoffset_t offset_s_attrval;  /* offset of string <attr value> */
     int32_t attr_type;
 } TM2_Attr;
 
 typedef struct
 {
-    uint32_t node_id; /* node id */
-    tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices <indices of input tensors> */
+    uint32_t node_id;                      /* node id */
+    tm_uoffset_t offset_vi_input_tensors;  /* offset of TM2_Vector_indices <indices of input tensors> */
     tm_uoffset_t offset_vi_output_tensors; /* offset of TM2_Vector_indices <indices of output tensors> */
-    tm_uoffset_t offset_t_operator; /* offset of table  <operator> */
-    tm_uoffset_t offset_s_nname; /* offset of string <node name> */
-    tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets <attrs> */
+    tm_uoffset_t offset_t_operator;        /* offset of table  <operator> */
+    tm_uoffset_t offset_s_nname;           /* offset of string <node name> */
+    tm_uoffset_t offset_vo_attrs;          /* offset of TM2_Vector_offsets <attrs> */
     tm_bool_t dynamic_shape;
 } TM2_Node;
 
 typedef struct
 {
-    uint32_t op_ver; /* version of operator */
-    uint32_t operator_type; /* operator type */
+    uint32_t op_ver;             /* version of operator */
+    uint32_t operator_type;      /* operator type */
     tm_uoffset_t offset_t_param; /* offset of table <operator param> */
 } TM2_Operator;
 
@@ -339,8 +339,8 @@ typedef struct
 {
     uint32_t tensor_id;
     uint32_t buffer_id;
-    tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims <dims> */
-    tm_uoffset_t offset_s_tname; /* offset of string <tensor name> */
+    tm_uoffset_t offset_vd_dims;        /* offset of TM2_Vector_dims <dims> */
+    tm_uoffset_t offset_s_tname;        /* offset of string <tensor name> */
     tm_uoffset_t offect_vo_quantparams; /* offset of TM2_Vector_offsets <quant params> */
     int32_t layout;
     int32_t type;
@@ -349,13 +349,13 @@ typedef struct
 
 typedef struct
 {
-    tm_size_t size; /* buffer size */
+    tm_size_t size;           /* buffer size */
     tm_uoffset_t offset_data; /* offset of buffer data */
 } TM2_Buffer;
 
 typedef struct
 {
-    tm_size_t size; /* string size */
+    tm_size_t size;           /* string size */
     tm_uoffset_t offset_data; /* offset of string data */
 } TM2_String;
 
@@ -387,7 +387,7 @@ typedef struct
 
 typedef struct
 {
-    tm_size_t v_num; /* number of vector elements */
+    tm_size_t v_num;  /* number of vector elements */
     float data[0][4]; /* x0, y0, x1, y1 */
 } TM2_Vector_anchors;
 
@@ -521,9 +521,9 @@ typedef struct
 
 typedef struct
 {
-    tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats <min_sizes> */
-    tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats <max_sizes> */
-    tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats <variances> */
+    tm_uoffset_t offset_vf_min_size;     /* offset of TM2_Vector_floats <min_sizes> */
+    tm_uoffset_t offset_vf_max_size;     /* offset of TM2_Vector_floats <max_sizes> */
+    tm_uoffset_t offset_vf_variance;     /* offset of TM2_Vector_floats <variances> */
     tm_uoffset_t offset_vf_aspect_ratio; /* offset of TM2_Vector_floats <aspect_ratios> */
     int32_t flip;
     int32_t clip;
@@ -581,7 +581,7 @@ typedef struct
 
 typedef struct
 {
-    tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats <ratios> */
+    tm_uoffset_t offset_vf_ratios;        /* pointer to TM2_Vector_floats <ratios> */
     tm_uoffset_t offset_vf_anchor_scales; /* pointer to TM2_Vector_floats <anchor_scales> */
     int32_t feat_stride;
     int32_t basesize;
@@ -603,8 +603,8 @@ typedef struct
 {
     int32_t axis;
     tm_uoffset_t offset_vi_slice_points; /* offset of TM2_Vector_dims <slice_points> */
-    tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims <begins> */
-    tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims <sizes> */
+    tm_uoffset_t offset_vi_begins;       /* offset of TM2_Vector_dims <begins> */
+    tm_uoffset_t offset_vi_sizes;        /* offset of TM2_Vector_dims <sizes> */
     int32_t iscaffe;
     int32_t ismxnet;
     int32_t isonnx;
@@ -892,7 +892,7 @@ typedef struct
 
 typedef struct
 {
-    int32_t resize_type;    // 1=nearest  2=bilinear  3=bicubic
+    int32_t resize_type; // 1=nearest  2=bilinear  3=bicubic
     float width_scale;
     float height_scale;
     int32_t output_width;
@@ -990,20 +990,19 @@ typedef struct
     tm_uoffset_t offset_reps;
 } TM2_TileParam;
 
-typedef struct 
+typedef struct
 {
     int sampler_type;
     int transformer_type;
     int shape_size;
     tm_uoffset_t offset_ta_shape;
-}TM2_SpatialTransformerParam;
+} TM2_SpatialTransformerParam;
 
-typedef struct 
+typedef struct
 {
     tm_uoffset_t offset_ex_shape;
     int dim_num;
-}TM2_ExpandParam;
-
+} TM2_ExpandParam;
 
 #ifdef __cplusplus
 }
diff --git a/source/serializer/tmfile/tm2_serializer.c b/source/serializer/tmfile/tm2_serializer.c
index b5d6f5953..3fc87d660 100644
--- a/source/serializer/tmfile/tm2_serializer.c
+++ b/source/serializer/tmfile/tm2_serializer.c
@@ -51,7 +51,6 @@
 
 #include <string.h>
 
-
 struct op_loader_entry
 {
     int op_type;
@@ -83,18 +82,18 @@ static char* strdup_name(char* buf, int size)
 
 static inline const TM2_Header* get_tm_file_header(const char* base)
 {
-    return ( const TM2_Header* )(base);
+    return (const TM2_Header*)(base);
 }
 
 static inline const TM2_Model* get_tm_file_model(const char* base, const TM2_Header* header)
 {
-    return ( const TM2_Model* )(base + header->offset_root);
+    return (const TM2_Model*)(base + header->offset_root);
 }
 
 static inline const TM2_Subgraph* get_tm_file_subgraph(const char* base, const TM2_Model* model)
 {
-    const TM2_Vector_offsets* v_graphs = ( TM2_Vector_offsets* )(base + model->offset_vo_subgraphs);
-    const TM2_Subgraph* tm_graph = ( TM2_Subgraph* )(base + v_graphs->offsets[0]);
+    const TM2_Vector_offsets* v_graphs = (TM2_Vector_offsets*)(base + model->offset_vo_subgraphs);
+    const TM2_Subgraph* tm_graph = (TM2_Subgraph*)(base + v_graphs->offsets[0]);
 
     return tm_graph;
 }
@@ -105,7 +104,7 @@ static struct op_loader_entry* find_op_loader(struct tm2_serializer* s, int op_t
 
     for (int i = 0; i < loader_num; i++)
     {
-        struct op_loader_entry* e = ( struct op_loader_entry* )get_vector_data(s->loader_list, i);
+        struct op_loader_entry* e = (struct op_loader_entry*)get_vector_data(s->loader_list, i);
 
         if (e->op_type == op_type)
             return e;
@@ -143,7 +142,7 @@ static int unregister_tm2_op_loader(struct tm2_serializer* s, int op_type, int o
 
     for (int i = 0; i < n; i++)
     {
-        struct op_loader_entry* e = ( struct op_loader_entry* )get_vector_data(s->loader_list, i);
+        struct op_loader_entry* e = (struct op_loader_entry*)get_vector_data(s->loader_list, i);
 
         if (e->op_type == op_type && e->loader == op_loader)
         {
@@ -157,11 +156,11 @@ static int unregister_tm2_op_loader(struct tm2_serializer* s, int op_type, int o
 
 static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph, struct tm2_priv* priv)
 {
-    char* mem_base = ( char* )priv->base;
+    char* mem_base = (char*)priv->base;
     const TM2_Subgraph* tm_graph = priv->subgraph;
 
-    const TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_tensors);
-    const TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_buffers);
+    const TM2_Vector_offsets* v_tensors = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_tensors);
+    const TM2_Vector_offsets* v_buffers = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_buffers);
 
     graph->graph_layout = tm_graph->graph_layout;
     graph->model_layout = tm_graph->model_layout;
@@ -175,8 +174,8 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
 
     for (int i = 0; i < v_tensors->v_num; i++)
     {
-        const TM2_Tensor* tm_tensor = ( TM2_Tensor* )(mem_base + v_tensors->offsets[i]);
-        int flag_permute = 0;    // flag the tensor has to be permute
+        const TM2_Tensor* tm_tensor = (TM2_Tensor*)(mem_base + v_tensors->offsets[i]);
+        int flag_permute = 0; // flag the tensor has to be permute
         int dims_org[8] = {0};
 
         /* TODO: check type definition */
@@ -193,14 +192,14 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
         if (tm_tensor->offset_s_tname != TM2_NOT_SET)
         {
             // TODO: using update the TM2 model
-            const TM2_String* tm_str = ( TM2_String* )(mem_base + tm_tensor->offset_s_tname);
+            const TM2_String* tm_str = (TM2_String*)(mem_base + tm_tensor->offset_s_tname);
             ir_tensor->name = strdup_name(mem_base + tm_str->offset_data, tm_str->size);
         }
 
         /* shape */
         if (tm_tensor->offset_vd_dims != TM2_NOT_SET)
         {
-            const TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )(mem_base + tm_tensor->offset_vd_dims);
+            const TM2_Vector_dims* v_dims = (TM2_Vector_dims*)(mem_base + tm_tensor->offset_vd_dims);
 
             if (tm_graph->model_layout == TENGINE_LAYOUT_NCHW)
             {
@@ -217,10 +216,10 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
                     dims_org[2] = v_dims->dims[2];
                     dims_org[3] = v_dims->dims[3];
 
-                    dims[0] = v_dims->dims[0];    // c_out
-                    dims[1] = v_dims->dims[3];    // c_in
-                    dims[2] = v_dims->dims[1];    // h
-                    dims[3] = v_dims->dims[2];    // w
+                    dims[0] = v_dims->dims[0]; // c_out
+                    dims[1] = v_dims->dims[3]; // c_in
+                    dims[2] = v_dims->dims[1]; // h
+                    dims[3] = v_dims->dims[2]; // w
 
                     set_ir_tensor_shape(ir_tensor, dims, v_dims->v_num);
 
@@ -236,7 +235,7 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
         /* load const type of tensor, such as the weight or bias for convolution node */
         if (ir_tensor->tensor_type == TENSOR_TYPE_CONST)
         {
-            const TM2_Buffer* tm_buf = ( TM2_Buffer* )(mem_base + v_buffers->offsets[tm_tensor->buffer_id]);
+            const TM2_Buffer* tm_buf = (TM2_Buffer*)(mem_base + v_buffers->offsets[tm_tensor->buffer_id]);
 
             /* fill temp data buffer to benchmark */
             if (tm_buf->offset_data == TM2_NOT_SET)
@@ -348,7 +347,7 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
 
                     if (type == TENGINE_DT_UINT8 || type == TENGINE_DT_INT8)
                     {
-                        unsigned char* tensor_data_org = ( unsigned char* )sys_malloc(size * sizeof(unsigned char));
+                        unsigned char* tensor_data_org = (unsigned char*)sys_malloc(size * sizeof(unsigned char));
                         unsigned char* original_date = (unsigned char*)ir_tensor->data;
 
                         for (int n = 0; n < size; n++)
@@ -363,9 +362,9 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
                         dims[3] = ir_tensor->dims[3];
 
                         /* nhwc to nchw */
-//                        fprintf(stderr, "%s:\n", ir_tensor->name);
-//                        fprintf(stderr, "original %d, %d, %d, %d\n", dims_org[0], dims_org[1], dims_org[2], dims_org[3]);
-//                        fprintf(stderr, "permute  %d, %d, %d, %d\n", dims[0], dims[1], dims[2], dims[3]);
+                        //                        fprintf(stderr, "%s:\n", ir_tensor->name);
+                        //                        fprintf(stderr, "original %d, %d, %d, %d\n", dims_org[0], dims_org[1], dims_org[2], dims_org[3]);
+                        //                        fprintf(stderr, "permute  %d, %d, %d, %d\n", dims[0], dims[1], dims[2], dims[3]);
 
                         unsigned char* input = tensor_data_org;
                         unsigned char* output = (unsigned char*)ir_tensor->data;
@@ -436,28 +435,27 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
         /* load vector type of tensor */
         if (tm_tensor->offect_vo_quantparams != TM2_NOT_SET)
         {
-            const TM2_Vector_offsets* v_quantparams =
-                    ( TM2_Vector_offsets* )(mem_base + tm_tensor->offect_vo_quantparams);
+            const TM2_Vector_offsets* v_quantparams = (TM2_Vector_offsets*)(mem_base + tm_tensor->offect_vo_quantparams);
 
             /* currently only support one quant param */
             ir_tensor->quant_param_num = v_quantparams->v_num;
             if (v_quantparams->v_num == 1)
             {
-                const TM2_QuantParam* tm_qtparam = ( TM2_QuantParam* )(mem_base + v_quantparams->offsets[0]);
+                const TM2_QuantParam* tm_qtparam = (TM2_QuantParam*)(mem_base + v_quantparams->offsets[0]);
                 ir_tensor->scale = tm_qtparam->scale;
                 ir_tensor->zero_point = tm_qtparam->zero_point;
 
-//                printf("name %s, scale %f, zero %d\n", ir_tensor->name, ir_tensor->scale, ir_tensor->zero_point);
+                //                printf("name %s, scale %f, zero %d\n", ir_tensor->name, ir_tensor->scale, ir_tensor->zero_point);
             }
             else if (v_quantparams->v_num > 1)
             {
                 // to do : need to be updated
-                ir_tensor->scale_list = ( float* )sys_malloc(sizeof(float) * v_quantparams->v_num);
-                ir_tensor->zp_list = ( int* )sys_malloc(sizeof(int) * v_quantparams->v_num);
+                ir_tensor->scale_list = (float*)sys_malloc(sizeof(float) * v_quantparams->v_num);
+                ir_tensor->zp_list = (int*)sys_malloc(sizeof(int) * v_quantparams->v_num);
 
                 for (int j = 0; j < v_quantparams->v_num; j++)
                 {
-                    const TM2_QuantParam* tm_qtparam = ( TM2_QuantParam* )(mem_base + v_quantparams->offsets[j]);
+                    const TM2_QuantParam* tm_qtparam = (TM2_QuantParam*)(mem_base + v_quantparams->offsets[j]);
                     ir_tensor->scale_list[j] = tm_qtparam->scale;
                     ir_tensor->zp_list[j] = tm_qtparam->zero_point;
                 }
@@ -469,16 +467,16 @@ static int load_graph_tensors(struct tm2_serializer* tm2_s, struct graph* graph,
 
 static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph, struct tm2_priv* priv)
 {
-    char* mem_base = ( char* )priv->base;
+    char* mem_base = (char*)priv->base;
     const TM2_Subgraph* tm_graph = priv->subgraph;
-    const TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_seq_nodes);
+    const TM2_Vector_offsets* v_nodes = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_seq_nodes);
 
     unsigned int i;
 
     for (i = 0; i < v_nodes->v_num; i++)
     {
-        const TM2_Node* tm_node = ( TM2_Node* )(mem_base + v_nodes->offsets[i]);
-        const TM2_Operator* tm_operator = ( TM2_Operator* )(mem_base + tm_node->offset_t_operator);
+        const TM2_Node* tm_node = (TM2_Node*)(mem_base + v_nodes->offsets[i]);
+        const TM2_Operator* tm_operator = (TM2_Operator*)(mem_base + tm_node->offset_t_operator);
         int op_type = tm_operator->operator_type;
         int op_version = tm_operator->op_ver;
 
@@ -509,7 +507,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
 
         if (tm_node->offset_s_nname != TM2_NOT_SET)
         {
-            const TM2_String* str = ( TM2_String* )(mem_base + tm_node->offset_s_nname);
+            const TM2_String* str = (TM2_String*)(mem_base + tm_node->offset_s_nname);
             // TODO: update with new tm2
             ir_node->name = strdup_name(mem_base + str->offset_data, str->size);
         }
@@ -517,8 +515,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
         /* node inputs */
         if (tm_node->offset_vi_input_tensors != TM2_NOT_SET)
         {
-            const TM2_Vector_indices* v_input_tensors =
-                    ( TM2_Vector_indices* )(mem_base + tm_node->offset_vi_input_tensors);
+            const TM2_Vector_indices* v_input_tensors = (TM2_Vector_indices*)(mem_base + tm_node->offset_vi_input_tensors);
 
             for (int j = 0; j < v_input_tensors->v_num; j++)
             {
@@ -542,8 +539,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
             break;
         }
 
-        const TM2_Vector_indices* v_output_tensors =
-                ( TM2_Vector_indices* )(mem_base + tm_node->offset_vi_output_tensors);
+        const TM2_Vector_indices* v_output_tensors = (TM2_Vector_indices*)(mem_base + tm_node->offset_vi_output_tensors);
 
         for (int j = 0; j < v_output_tensors->v_num; j++)
         {
@@ -565,7 +561,7 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
         {
             if (op_type == TM2_OPTYPE_SOFTMAX)
             {
-                TM2_SoftmaxParam* tm_param = ( TM2_SoftmaxParam* )(mem_base + tm_operator->offset_t_param);
+                TM2_SoftmaxParam* tm_param = (TM2_SoftmaxParam*)(mem_base + tm_operator->offset_t_param);
 
                 if (tm_param->axis == 3)
                     tm_param->axis = 1;
@@ -587,14 +583,14 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
 
             if (op_type == TM2_OPTYPE_REDUCTION)
             {
-                TM2_ReductionParam* tm_param = ( TM2_ReductionParam* )(mem_base + tm_operator->offset_t_param);
+                TM2_ReductionParam* tm_param = (TM2_ReductionParam*)(mem_base + tm_operator->offset_t_param);
 
                 if (tm_param->dim_0 == 1 && tm_param->dim_1 == 2)
                 {
                     tm_param->dim_0 = 2;
                     tm_param->dim_1 = 3;
                 }
-                else if(tm_param->dim_0 == -1)
+                else if (tm_param->dim_0 == -1)
                 {
                     tm_param->dim_0 = 4;
                 }
@@ -606,35 +602,35 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
 
             if (op_type == TM2_OPTYPE_PAD)
             {
-                TM2_PadParam* tm_param = ( TM2_PadParam* )(mem_base + tm_operator->offset_t_param);
+                TM2_PadParam* tm_param = (TM2_PadParam*)(mem_base + tm_operator->offset_t_param);
 
                 int pads[8] = {0};
-                pads[0] = tm_param->pad_n_0;    // n
+                pads[0] = tm_param->pad_n_0; // n
                 pads[1] = tm_param->pad_n_1;
 
-                pads[2] = tm_param->pad_c_0;    // h
+                pads[2] = tm_param->pad_c_0; // h
                 pads[3] = tm_param->pad_c_1;
 
-                pads[4] = tm_param->pad_h_0;    // w
+                pads[4] = tm_param->pad_h_0; // w
                 pads[5] = tm_param->pad_h_1;
 
-                pads[6] = tm_param->pad_w_0;    // c
+                pads[6] = tm_param->pad_w_0; // c
                 pads[7] = tm_param->pad_w_1;
 
                 /* nhwc to nchw */
-                tm_param->pad_c_0 = pads[6];    // c
+                tm_param->pad_c_0 = pads[6]; // c
                 tm_param->pad_c_1 = pads[7];
 
-                tm_param->pad_h_0 = pads[2];    // h
+                tm_param->pad_h_0 = pads[2]; // h
                 tm_param->pad_h_1 = pads[3];
 
-                tm_param->pad_w_0 = pads[4];    // w
+                tm_param->pad_w_0 = pads[4]; // w
                 tm_param->pad_w_1 = pads[5];
             }
 
             if (op_type == TM2_OPTYPE_STRIDEDSLICE)
             {
-                TM2_StridedSliceParam* tm_param = ( TM2_StridedSliceParam* )(mem_base + tm_operator->offset_t_param);
+                TM2_StridedSliceParam* tm_param = (TM2_StridedSliceParam*)(mem_base + tm_operator->offset_t_param);
 
                 int begin[4] = {0};
                 int end[4] = {0};
@@ -673,8 +669,8 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
 
             if (op_type == TM2_OPTYPE_RESHAPE)
             {
-                TM2_ReshapeParam* tm_param = ( TM2_ReshapeParam* )(mem_base + tm_operator->offset_t_param);
-                TM2_Vector_dims* v_reshape = ( TM2_Vector_dims* )(mem_base + tm_param->offset_re_shape);
+                TM2_ReshapeParam* tm_param = (TM2_ReshapeParam*)(mem_base + tm_operator->offset_t_param);
+                TM2_Vector_dims* v_reshape = (TM2_Vector_dims*)(mem_base + tm_param->offset_re_shape);
 
                 if (tm_param->offset_re_shape != TM2_NOT_SET)
                 {
@@ -737,12 +733,12 @@ static int load_graph_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph
 
 static int set_graph_io_nodes(struct tm2_serializer* tm2_s, struct graph* ir_graph, struct tm2_priv* priv)
 {
-    char* mem_base = ( char* )priv->base;
+    char* mem_base = (char*)priv->base;
     const TM2_Subgraph* tm_graph = priv->subgraph;
-    const TM2_Vector_indices* v_input_nodes = ( TM2_Vector_indices* )(mem_base + tm_graph->offset_vi_input_indices);
-    const TM2_Vector_indices* v_output_nodes = ( TM2_Vector_indices* )(mem_base + tm_graph->offset_vi_output_indices);
+    const TM2_Vector_indices* v_input_nodes = (TM2_Vector_indices*)(mem_base + tm_graph->offset_vi_input_indices);
+    const TM2_Vector_indices* v_output_nodes = (TM2_Vector_indices*)(mem_base + tm_graph->offset_vi_output_indices);
 
-    int16_t* node_idx = ( int16_t* )sys_malloc(sizeof(int16_t) * v_input_nodes->v_num);
+    int16_t* node_idx = (int16_t*)sys_malloc(sizeof(int16_t) * v_input_nodes->v_num);
 
     if (node_idx == NULL)
     {
@@ -758,7 +754,7 @@ static int set_graph_io_nodes(struct tm2_serializer* tm2_s, struct graph* ir_gra
 
     sys_free(node_idx);
 
-    node_idx = ( int16_t* )sys_malloc(sizeof(int16_t) * v_output_nodes->v_num);
+    node_idx = (int16_t*)sys_malloc(sizeof(int16_t) * v_output_nodes->v_num);
 
     for (unsigned int i = 0; i < v_output_nodes->v_num; i++)
     {
@@ -774,10 +770,10 @@ static int set_graph_io_nodes(struct tm2_serializer* tm2_s, struct graph* ir_gra
 
 static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, struct tm2_priv* priv)
 {
-    char* mem_base = ( char* )priv->base;
-    const TM2_Vector_offsets* v_graphs = ( TM2_Vector_offsets* )(mem_base + priv->model->offset_vo_subgraphs);
+    char* mem_base = (char*)priv->base;
+    const TM2_Vector_offsets* v_graphs = (TM2_Vector_offsets*)(mem_base + priv->model->offset_vo_subgraphs);
     const TM2_Subgraph* tm_graph = priv->subgraph;
-    const TM2_Vector_offsets* v_sub_info = ( TM2_Vector_offsets* )(mem_base + tm_graph->offset_vo_sub_info);
+    const TM2_Vector_offsets* v_sub_info = (TM2_Vector_offsets*)(mem_base + tm_graph->offset_vo_sub_info);
 
     if (v_sub_info == TM2_NOT_SET || v_graphs->v_num == 1)
     {
@@ -789,10 +785,10 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st
     int sub_graph_num = v_sub_info->v_num;
     for (int i = 0; i < sub_graph_num; i++)
     {
-        struct subgraph* subgraph = ( struct subgraph* )sys_malloc(sizeof(struct subgraph));
+        struct subgraph* subgraph = (struct subgraph*)sys_malloc(sizeof(struct subgraph));
         init_ir_subgraph(graph, subgraph, i);
 
-        TM2_Sub_Info* sub_info = ( TM2_Sub_Info* )(mem_base + v_sub_info->offsets[i]);
+        TM2_Sub_Info* sub_info = (TM2_Sub_Info*)(mem_base + v_sub_info->offsets[i]);
         subgraph->index = sub_info->subgraph_id;
         subgraph->input_wait_count = sub_info->input_wait_count;
 
@@ -802,7 +798,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st
         // subgraph->nn_dev->name = strdup_name(mem_base + device_name->offset_data, device_name->size);
         char* name = (char*)(mem_base + device_name->offset_data);
 
-        TM2_Vector_indices* v_node_list = ( TM2_Vector_indices* )(mem_base + sub_info->offset_vi_node_list);
+        TM2_Vector_indices* v_node_list = (TM2_Vector_indices*)(mem_base + sub_info->offset_vi_node_list);
         subgraph->node_num = v_node_list->v_num;
         subgraph->node_list = (uint16_t*)sys_malloc(sizeof(uint16_t) * subgraph->node_num);
         for (int j = 0; j < v_node_list->v_num; j++)
@@ -810,7 +806,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st
             subgraph->node_list[j] = v_node_list->indices[j];
         }
 
-        TM2_Vector_indices* v_input_tensor = ( TM2_Vector_indices* )(mem_base + sub_info->offset_vi_input_tensor);
+        TM2_Vector_indices* v_input_tensor = (TM2_Vector_indices*)(mem_base + sub_info->offset_vi_input_tensor);
         subgraph->input_num = v_input_tensor->v_num;
         subgraph->input_tensor_list = (uint16_t*)sys_malloc(sizeof(uint16_t) * subgraph->input_num);
         for (int j = 0; j < v_input_tensor->v_num; j++)
@@ -818,7 +814,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st
             subgraph->input_tensor_list[j] = v_input_tensor->indices[j];
         }
 
-        TM2_Vector_indices* v_output_tensor = ( TM2_Vector_indices* )(mem_base + sub_info->offset_vi_output_tensor);
+        TM2_Vector_indices* v_output_tensor = (TM2_Vector_indices*)(mem_base + sub_info->offset_vi_output_tensor);
         subgraph->output_num = v_output_tensor->v_num;
         subgraph->output_tensor_list = (uint16_t*)sys_malloc(sizeof(uint16_t) * subgraph->output_num);
         for (int j = 0; j < v_output_tensor->v_num; j++)
@@ -838,7 +834,7 @@ static int load_graph_sub_info(struct tm2_serializer* s, struct graph* graph, st
 
 static int load_graph(struct serializer* s, struct graph* graph, struct tm2_priv* priv)
 {
-    struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s;
+    struct tm2_serializer* tm2_s = (struct tm2_serializer*)s;
 
     /* version check */
     if (priv->header->ver_main != TM2_FILE_VER_MAIN)
@@ -861,7 +857,7 @@ static int load_graph(struct serializer* s, struct graph* graph, struct tm2_priv
 
     return 0;
 
-    error:
+error:
     unload_graph(s, graph, priv, NULL);
     return -1;
 }
@@ -891,10 +887,10 @@ static int load_model(struct serializer* s, struct graph* graph, const char* fna
     //        return -1;
     //    }
 
-    void* mem_base = ( void* )sys_malloc(file_len);
+    void* mem_base = (void*)sys_malloc(file_len);
     int ret = read(fd, mem_base, file_len);
 
-    struct tm2_priv* priv = ( struct tm2_priv* )sys_malloc(sizeof(struct tm2_priv));
+    struct tm2_priv* priv = (struct tm2_priv*)sys_malloc(sizeof(struct tm2_priv));
 
     if (priv == NULL)
     {
@@ -918,7 +914,7 @@ static int load_model(struct serializer* s, struct graph* graph, const char* fna
 
 static int load_mem(struct serializer* s, struct graph* graph, const void* addr, int size, va_list ap)
 {
-    struct tm2_priv* priv = ( struct tm2_priv* )sys_malloc(sizeof(struct tm2_priv));
+    struct tm2_priv* priv = (struct tm2_priv*)sys_malloc(sizeof(struct tm2_priv));
 
     if (priv == NULL)
     {
@@ -941,7 +937,7 @@ static int load_mem(struct serializer* s, struct graph* graph, const void* addr,
 
 static int unload_graph(struct serializer* s, struct graph* graph, void* s_priv, void* dev_priv)
 {
-    struct tm2_priv* priv = ( struct tm2_priv* )s_priv;
+    struct tm2_priv* priv = (struct tm2_priv*)s_priv;
 
     if (priv->fd >= 0)
     {
@@ -952,7 +948,7 @@ static int unload_graph(struct serializer* s, struct graph* graph, void* s_priv,
 
     if (priv->base)
     {
-        sys_free(( void* )priv->base);
+        sys_free((void*)priv->base);
         priv->base = NULL;
     }
 
@@ -969,7 +965,7 @@ static int unload_graph(struct serializer* s, struct graph* graph, void* s_priv,
 static int register_op_loader(struct serializer* s, int op_type, int op_ver, void* op_load_func, void* op_map_func,
                               void* ver_map_func)
 {
-    struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s;
+    struct tm2_serializer* tm2_s = (struct tm2_serializer*)s;
     tm2_op_loader_t op_load = (tm2_op_loader_t)op_load_func;
     tm2_map_t op_map = (tm2_map_t)op_map_func;
     tm2_map_t ver_map = (tm2_map_t)ver_map_func;
@@ -979,7 +975,7 @@ static int register_op_loader(struct serializer* s, int op_type, int op_ver, voi
 
 static int unregister_op_loader(struct serializer* s, int op_type, int op_ver, void* op_load_func)
 {
-    struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s;
+    struct tm2_serializer* tm2_s = (struct tm2_serializer*)s;
     tm2_op_loader_t op_load = (tm2_op_loader_t)op_load_func;
 
     return unregister_tm2_op_loader(tm2_s, op_type, op_ver, op_load);
@@ -1002,7 +998,7 @@ static int input_op_map(int op)
 
 static int init_tm2_serializer(struct serializer* s)
 {
-    struct tm2_serializer* tm2_s = ( struct tm2_serializer* )s;
+    struct tm2_serializer* tm2_s = (struct tm2_serializer*)s;
 
     tm2_s->loader_list = create_vector(sizeof(struct op_loader_entry), NULL);
 
@@ -1028,27 +1024,24 @@ static int release_tm2_serializer(struct serializer* s)
 }
 
 static struct tm2_serializer tm2_serializer = {
-        .base =
-                {
-                        .get_name = get_name,
-                        .load_model = load_model,
-                        .load_mem = load_mem,
-                        .unload_graph = unload_graph,
-                        .register_op_loader = register_op_loader,
-                        .unregister_op_loader = unregister_op_loader,
-                        .init = init_tm2_serializer,
-                        .release = release_tm2_serializer,
-                },
-        .loader_list = NULL,
+    .base = {
+        .get_name = get_name,
+        .load_model = load_model,
+        .load_mem = load_mem,
+        .unload_graph = unload_graph,
+        .register_op_loader = register_op_loader,
+        .unregister_op_loader = unregister_op_loader,
+        .init = init_tm2_serializer,
+        .release = release_tm2_serializer,
+    },
+    .loader_list = NULL,
 };
 
-
 int register_tm2_serializer()
 {
     return register_serializer((struct serializer*)&tm2_serializer);
 }
 
-
 int unregister_tm2_serializer()
 {
     return unregister_serializer((struct serializer*)&tm2_serializer);
diff --git a/source/serializer/tmfile/tm2_serializer.h b/source/serializer/tmfile/tm2_serializer.h
index 23e2edf21..1e0887fe7 100644
--- a/source/serializer/tmfile/tm2_serializer.h
+++ b/source/serializer/tmfile/tm2_serializer.h
@@ -31,19 +31,16 @@
 struct node;
 struct graph;
 
-
 struct tm2_priv
 {
     int fd; /* for file load */
     int mem_len;
-    const char* base; /* mem base for model */
-    const TM2_Header* header; /* file header */
-    const TM2_Model* model; /* model header */
+    const char* base;             /* mem base for model */
+    const TM2_Header* header;     /* file header */
+    const TM2_Model* model;       /* model header */
     const TM2_Subgraph* subgraph; /* subgraph */
 };
 
-
 typedef int (*tm2_op_loader_t)(struct graph*, struct node*, const TM2_Node*, const TM2_Operator* tm_op);
 
-
 typedef int (*tm2_map_t)(int);
diff --git a/source/system/cpu.c b/source/system/cpu.c
index 7feffa8ac..87a6007a0 100644
--- a/source/system/cpu.c
+++ b/source/system/cpu.c
@@ -174,7 +174,7 @@ static int get_max_freq_khz(int cpuid)
 
             fclose(fp);
 
-            if (max_freq_khz <=0 && EOF == ret)
+            if (max_freq_khz <= 0 && EOF == ret)
                 return -1;
             else
                 return max_freq_khz;
@@ -206,7 +206,7 @@ static int set_sched_affinity(size_t thread_affinity_mask)
 #define CPU_SETSIZE 1024
 #endif
 #ifndef __NCPUBITS
-#define __NCPUBITS  (8 * sizeof (unsigned long))
+#define __NCPUBITS (8 * sizeof(unsigned long))
 #endif
 
     typedef struct
@@ -222,7 +222,7 @@ static int set_sched_affinity(size_t thread_affinity_mask)
 #if (defined __GLIBC__) || (defined _OHOS_) || (defined V831)
     pid_t pid = syscall(SYS_gettid);
 #else
-    #ifdef PI3
+#ifdef PI3
     pid_t pid = getpid();
 #else
 
@@ -237,7 +237,7 @@ static int set_sched_affinity(size_t thread_affinity_mask)
 #endif
     cpu_set_t mask;
     CPU_ZERO(&mask);
-//    for (int i = 0; i < ( int )sizeof(size_t) * 8; i++)
+    //    for (int i = 0; i < ( int )sizeof(size_t) * 8; i++)
     for (int i = 0; i < core_count; i++)
     {
         if (thread_affinity_mask & (1 << i))
@@ -361,13 +361,13 @@ int set_cpu_affine(size_t mask)
 
 #elif __APPLE_IOS__ || _MSC_VER
     // threads affinity not supported on ios
-    ( void )mask;
+    (void)mask;
     return -1;
 #else
     int status = set_sched_affinity(mask);
     if (0 != status) return -1;
 
-	return 0;
+    return 0;
 #endif
 
     return 0;
@@ -377,20 +377,20 @@ size_t get_cpu_cluster_mask(int cluster)
 {
     switch (cluster)
     {
-        case TENGINE_CLUSTER_BIG:
-            if (0 != affinity_mask_big_cluster)
-                return affinity_mask_big_cluster;
-            break;
-        case TENGINE_CLUSTER_MEDIUM:
-            if (0 != affinity_mask_medium_cluster)
-                return affinity_mask_medium_cluster;
-            break;
-        case TENGINE_CLUSTER_LITTLE:
-            if (0 != affinity_mask_little_cluster)
-                return affinity_mask_little_cluster;
-            break;
-        default:
-            break;
+    case TENGINE_CLUSTER_BIG:
+        if (0 != affinity_mask_big_cluster)
+            return affinity_mask_big_cluster;
+        break;
+    case TENGINE_CLUSTER_MEDIUM:
+        if (0 != affinity_mask_medium_cluster)
+            return affinity_mask_medium_cluster;
+        break;
+    case TENGINE_CLUSTER_LITTLE:
+        if (0 != affinity_mask_little_cluster)
+            return affinity_mask_little_cluster;
+        break;
+    default:
+        break;
     }
 
     return affinity_mask_all_cluster;
diff --git a/source/utility/float.c b/source/utility/float.c
index e8bd5e6f2..0496a2d7a 100644
--- a/source/utility/float.c
+++ b/source/utility/float.c
@@ -24,18 +24,17 @@
 
 #include "utility/float.h"
 
-#define BF16_EXP_MAX  ( 256 - 1)   //  2^8 - 1
-#define FP16_EXP_MAX  (  32 - 1)   //  2^5 - 1
-#define FP32_EXP_MAX  ( 256 - 1)   //  2^8 - 1
-#define FP64_EXP_MAX  (2048 - 1)   // 2^11 - 1
-
-#define FP16_NAN      ((FP16_EXP_MAX << 10) + 1)
-#define FP16_INF      ((FP16_EXP_MAX << 10) + 0)
-#define BF16_NAN      ((BF16_EXP_MAX <<  7) + 1)
-#define BF16_INF      ((BF16_EXP_MAX <<  7) + 0)
-#define FP32_NAN      ((FP32_EXP_MAX << 23) + 1)
-#define FP32_INF      ((FP32_EXP_MAX << 23) + 0)
-
+#define BF16_EXP_MAX (256 - 1)  //  2^8 - 1
+#define FP16_EXP_MAX (32 - 1)   //  2^5 - 1
+#define FP32_EXP_MAX (256 - 1)  //  2^8 - 1
+#define FP64_EXP_MAX (2048 - 1) // 2^11 - 1
+
+#define FP16_NAN ((FP16_EXP_MAX << 10) + 1)
+#define FP16_INF ((FP16_EXP_MAX << 10) + 0)
+#define BF16_NAN ((BF16_EXP_MAX << 7) + 1)
+#define BF16_INF ((BF16_EXP_MAX << 7) + 0)
+#define FP32_NAN ((FP32_EXP_MAX << 23) + 1)
+#define FP32_INF ((FP32_EXP_MAX << 23) + 0)
 
 #ifndef __ARM_ARCH
 fp32_t fp16_to_fp32(fp16_t package)
@@ -55,7 +54,7 @@ fp32_t fp16_to_fp32(fp16_t package)
     if (FP16_EXP_MAX != package.exp && 0 != package.exp && 0 != package.frac)
     {
         data.frac = package.frac << 13;
-        data.exp  = package.exp + (- 15 + 127);
+        data.exp = package.exp + (-15 + 127);
         data.sign = package.sign;
 
         return data.value;
@@ -65,7 +64,7 @@ fp32_t fp16_to_fp32(fp16_t package)
     if (FP16_EXP_MAX == package.exp && 0 == package.frac)
     {
         data.frac = 0;
-        data.exp  = FP32_EXP_MAX;
+        data.exp = FP32_EXP_MAX;
         data.sign = package.sign;
 
         return data.value;
@@ -75,7 +74,7 @@ fp32_t fp16_to_fp32(fp16_t package)
     if (FP16_EXP_MAX == package.exp && 0 != package.frac)
     {
         data.frac = 1;
-        data.exp  = FP32_EXP_MAX;
+        data.exp = FP32_EXP_MAX;
         data.sign = package.sign;
 
         return data.value;
@@ -85,7 +84,7 @@ fp32_t fp16_to_fp32(fp16_t package)
     if (0 == package.exp && 0 != package.frac)
     {
         uint16_t frac = package.frac;
-        uint16_t exp  = 0;
+        uint16_t exp = 0;
 
         while (0 == (frac & (uint16_t)0x200))
         {
@@ -94,7 +93,7 @@ fp32_t fp16_to_fp32(fp16_t package)
         }
 
         data.frac = (frac << 1) & (uint16_t)0x3FF;
-        data.exp  = -exp + (-15 + 127);
+        data.exp = -exp + (-15 + 127);
         data.sign = package.sign;
 
         return data.value;
@@ -103,7 +102,6 @@ fp32_t fp16_to_fp32(fp16_t package)
     return data.value;
 }
 
-
 fp16_t fp32_to_fp16(fp32_t value)
 {
     fp32_pack_t* package = (fp32_pack_t*)(&value);
@@ -113,7 +111,7 @@ fp16_t fp32_to_fp16(fp32_t value)
     if (0 == package->exp)
     {
         data.value = 0;
-        data.sign  = package->sign;
+        data.sign = package->sign;
 
         return data;
     }
@@ -121,13 +119,13 @@ fp16_t fp32_to_fp16(fp32_t value)
     // means normalized value
     if (FP32_EXP_MAX != package->exp && 0 != package->exp && 0 != package->frac)
     {
-        int16_t exp  = package->exp + (-15 + 127);
+        int16_t exp = package->exp + (-15 + 127);
 
         // means overflow
         if (31 <= exp)
         {
             data.frac = 0;
-            data.exp  = FP16_EXP_MAX;
+            data.exp = FP16_EXP_MAX;
             data.sign = package->sign;
         }
         else if (0 >= exp)
@@ -135,21 +133,21 @@ fp16_t fp32_to_fp16(fp32_t value)
             // means subnormal numbers
             if (-10 <= exp)
             {
-                data.frac  = (package->frac | 0x800000) >> (14 - exp);
-                data.exp   = 0;
-                data.sign  = package->sign;
+                data.frac = (package->frac | 0x800000) >> (14 - exp);
+                data.exp = 0;
+                data.sign = package->sign;
             }
             // means underflow
             else
             {
                 data.value = 0;
-                data.sign  = package->sign;
+                data.sign = package->sign;
             }
         }
         else
         {
             data.frac = package->frac >> 13;
-            data.exp  = exp;
+            data.exp = exp;
             data.sign = package->sign;
         }
 
@@ -160,7 +158,7 @@ fp16_t fp32_to_fp16(fp32_t value)
     if (FP32_EXP_MAX == package->exp && 0 == package->frac)
     {
         data.frac = 0;
-        data.exp  = FP16_EXP_MAX;
+        data.exp = FP16_EXP_MAX;
         data.sign = package->sign;
 
         return data;
@@ -170,7 +168,7 @@ fp16_t fp32_to_fp16(fp32_t value)
     if (FP32_EXP_MAX == package->exp && 0 != package->frac)
     {
         data.frac = 1;
-        data.exp  = FP16_EXP_MAX;
+        data.exp = FP16_EXP_MAX;
         data.sign = package->sign;
 
         return data;
@@ -181,7 +179,6 @@ fp16_t fp32_to_fp16(fp32_t value)
 }
 #endif
 
-
 fp32_t bf16_to_fp32(bf16_t package)
 {
     fp32_pack_t data;
@@ -189,7 +186,6 @@ fp32_t bf16_to_fp32(bf16_t package)
     return data.value;
 }
 
-
 bf16_t fp32_to_bf16(fp32_t value)
 {
     fp32_pack_t* package = (fp32_pack_t*)(&value);
@@ -198,7 +194,6 @@ bf16_t fp32_to_bf16(fp32_t value)
     return data;
 }
 
-
 #ifndef _MSC_VER
 fp32_t pxr24_to_fp32(pxr24_pack_t package)
 {
@@ -210,7 +205,6 @@ fp32_t pxr24_to_fp32(pxr24_pack_t package)
     return data.value;
 }
 
-
 pxr24_pack_t fp32_to_pxr24(fp32_t value)
 {
     fp32_pack_t* package = (fp32_pack_t*)(&value);
@@ -220,7 +214,7 @@ pxr24_pack_t fp32_to_pxr24(fp32_t value)
     pxr24_pack_t* ptr = (pxr24_pack_t*)((uint8_t*)(&pxr24_val));
 
     data.frac = ptr->frac;
-    data.exp  = ptr->exp;
+    data.exp = ptr->exp;
     data.sign = ptr->sign;
 
     return data;
diff --git a/source/utility/float.h b/source/utility/float.h
index a02f1a518..e7fdef127 100644
--- a/source/utility/float.h
+++ b/source/utility/float.h
@@ -39,64 +39,59 @@
 // IEEE 754
 // ISO/IEC/IEEE FDIS 60559:2010
 
-
 #ifdef _MSC_VER
-#pragma pack (push,1)
+#pragma pack(push, 1)
 #endif
 typedef union fp16_pack
 {
     struct
     {
         uint16_t frac : 10;
-        uint16_t exp  :  5;
-        uint16_t sign :  1;
+        uint16_t exp : 5;
+        uint16_t sign : 1;
     } PACKAGE_MARK;
     uint16_t value;
 } PACKAGE_MARK fp16_pack_t;
 
-
 typedef union bf16_pack
 {
     struct
     {
-        uint16_t frac :  7;
-        uint16_t exp  :  8;
-        uint16_t sign :  1;
+        uint16_t frac : 7;
+        uint16_t exp : 8;
+        uint16_t sign : 1;
     } PACKAGE_MARK;
     uint16_t value;
 } PACKAGE_MARK bf16_pack_t;
 
-
 #ifdef _MSC_VER
 typedef struct afp24_pack
 {
     uint16_t frac : 16;
-    uint8_t  exp  :  7;
-    uint8_t  sign :  1;
+    uint8_t exp : 7;
+    uint8_t sign : 1;
 } afp24_pack_t;
 
-
 typedef struct pxr24_pack
 {
     uint16_t frac : 15;
-    uint16_t      :  1;
-    uint8_t       :  7;
-    uint8_t  sign :  1;
+    uint16_t : 1;
+    uint8_t : 7;
+    uint8_t sign : 1;
 } pxr24_pack_t;
 #else
 typedef struct afp24_pack
 {
     uint32_t frac : 16;
-    uint32_t exp  :  7;
-    uint32_t sign :  1;
+    uint32_t exp : 7;
+    uint32_t sign : 1;
 } PACKAGE_MARK afp24_pack_t;
 
-
 typedef struct pxr24_pack
 {
     uint32_t frac : 15;
-    uint32_t exp  :  8;
-    uint32_t sign :  1;
+    uint32_t exp : 8;
+    uint32_t sign : 1;
 } PACKAGE_MARK pxr24_pack_t;
 #endif
 
@@ -105,20 +100,19 @@ typedef union fp32_pack
     struct
     {
         uint32_t frac : 23;
-        uint32_t exp  :  8;
-        uint32_t sign :  1;
+        uint32_t exp : 8;
+        uint32_t sign : 1;
     } PACKAGE_MARK;
     float value;
 } PACKAGE_MARK fp32_pack_t;
 
-
 typedef union fp64_pack
 {
     struct
     {
         uint64_t frac : 52;
-        uint64_t  exp : 11;
-        uint64_t sign :  1;
+        uint64_t exp : 11;
+        uint64_t sign : 1;
     } PACKAGE_MARK;
     double value;
 } PACKAGE_MARK fp64_pack_t;
@@ -126,16 +120,14 @@ typedef union fp64_pack
 #pragma pack(pop)
 #endif
 
-
 #ifdef __ARM_ARCH
-typedef __fp16      fp16_t;
+typedef __fp16 fp16_t;
 #else
 typedef fp16_pack_t fp16_t;
 #endif
 typedef bf16_pack_t bf16_t;
-typedef float       fp32_t;
-typedef double      fp64_t;
-
+typedef float fp32_t;
+typedef double fp64_t;
 
 #ifndef __ARM_ARCH
 /*!
@@ -147,7 +139,6 @@ typedef double      fp64_t;
  */
 fp32_t fp16_to_fp32(fp16_t package);
 
-
 /*!
  * @brief  Convert a number from float32 to float16.
  *
@@ -158,7 +149,6 @@ fp32_t fp16_to_fp32(fp16_t package);
 fp16_t fp32_to_fp16(fp32_t package);
 #endif
 
-
 /*!
  * @brief  Convert a number from float16 to float32.
  *
@@ -168,7 +158,6 @@ fp16_t fp32_to_fp16(fp32_t package);
  */
 fp32_t bf16_to_fp32(bf16_t package);
 
-
 /*!
  * @brief  Convert a number from float32 to float16.
  *
@@ -178,7 +167,6 @@ fp32_t bf16_to_fp32(bf16_t package);
  */
 bf16_t fp32_to_bf16(fp32_t package);
 
-
 #ifdef __ARM_ARCH
 #define fp16_to_fp32(data) ({ float f = data; f; })
 #define fp32_to_fp16(data) ({ __fp16 f = data; f; })
diff --git a/source/utility/lock.c b/source/utility/lock.c
index 6d299b6d3..f3de78577 100644
--- a/source/utility/lock.c
+++ b/source/utility/lock.c
@@ -27,26 +27,22 @@
 #include "defines.h"
 #include "utility/sys_port.h"
 
-
 static inline void bare_metal_mutex_init(mutex_t* mutex)
 {
     mutex->locker = sys_malloc(sizeof(mutex->locker));
     *((int*)(mutex->locker)) = 0;
 }
 
-
 static inline void bare_metal_mutex_lock(mutex_t* mutex)
 {
     *((int*)(mutex->locker)) = 1;
 }
 
-
 static inline void bare_metal_mutex_unlock(mutex_t* mutex)
 {
     *((int*)(mutex->locker)) = 0;
 }
 
-
 static inline void bare_metal_mutex_free(mutex_t* mutex)
 {
     if (NULL != mutex->locker)
@@ -57,43 +53,34 @@ static inline void bare_metal_mutex_free(mutex_t* mutex)
     mutex->locker = NULL;
 }
 
-
 // for WIN MSVC
 
-
-
-
 #ifdef TENGINE_HAS_LIB_POSIX_THREAD
 #include <pthread.h>
 
 typedef pthread_mutex_t lock_t;
 
-
 static inline void posix_thread_mutex_init(mutex_t* mutex)
 {
     mutex->locker = sys_malloc(sizeof(lock_t));
     pthread_mutex_init((lock_t*)mutex->locker, NULL);
 }
 
-
 static inline void posix_thread_mutex_lock(mutex_t* mutex)
 {
     pthread_mutex_lock((lock_t*)mutex->locker);
 }
 
-
 static inline void posix_thread_mutex_unlock(mutex_t* mutex)
 {
     pthread_mutex_unlock((lock_t*)mutex->locker);
 }
 
-
 static inline void posix_thread_mutex_free(mutex_t* mutex)
 {
     return bare_metal_mutex_free(mutex);
 }
 
-
 void init_mutex(mutex_t* mutex)
 {
     mutex->init = posix_thread_mutex_init;
@@ -108,14 +95,12 @@ void init_mutex(mutex_t* mutex)
 
 typedef CRITICAL_SECTION lock_t;
 
-
-static inline void  win_mutex_init(mutex_t* mutex)
+static inline void win_mutex_init(mutex_t* mutex)
 {
     mutex->locker = sys_malloc(sizeof(lock_t));
     InitializeCriticalSection((lock_t*)mutex->locker);
 }
 
-
 static inline void win_mutex_lock(mutex_t* mutex)
 {
     if (NULL != mutex->locker)
@@ -124,7 +109,6 @@ static inline void win_mutex_lock(mutex_t* mutex)
     }
 }
 
-
 static inline void win_mutex_unlock(mutex_t* mutex)
 {
     if (NULL != mutex->locker)
@@ -133,13 +117,11 @@ static inline void win_mutex_unlock(mutex_t* mutex)
     }
 }
 
-
 static inline void win_mutex_free(mutex_t* mutex)
 {
     return bare_metal_mutex_free(mutex);
 }
 
-
 void init_mutex(mutex_t* mutex)
 {
     mutex->init = win_mutex_init;
@@ -161,27 +143,17 @@ void init_mutex(mutex_t* mutex)
 }
 #endif // end TENGINE_HAS_LIB_POSIX_THREAD
 
-
 void lock_mutex(mutex_t* mutex)
 {
     return mutex->lock(mutex);
 }
 
-
 void unlock_mutex(mutex_t* mutex)
 {
     return mutex->unlock(mutex);
 }
 
-
 void free_mutex(mutex_t* mutex)
 {
     return mutex->free(mutex);
 }
-
-
-
-
-
-
-
diff --git a/source/utility/lock.h b/source/utility/lock.h
index b4d7ee4c3..502fc99a4 100644
--- a/source/utility/lock.h
+++ b/source/utility/lock.h
@@ -24,21 +24,19 @@
 
 #pragma once
 
-
 /*!
  * @struct abstract_mutex
  * @brief  Abstract mutex_t, platform independence
  */
 typedef struct abstract_mutex
 {
-    void* locker;                                    //!< platform dependence mutex impl
-    void (*init)(struct abstract_mutex* mutex);      //!< init this mutex
-    void (*lock)(struct abstract_mutex* mutex);      //!< lock this mutex
-    void (*unlock)(struct abstract_mutex* mutex);    //!< unlock this mutex
-    void (*free)(struct abstract_mutex* mutex);      //!< destroy this mutex
+    void* locker;                                 //!< platform dependence mutex impl
+    void (*init)(struct abstract_mutex* mutex);   //!< init this mutex
+    void (*lock)(struct abstract_mutex* mutex);   //!< lock this mutex
+    void (*unlock)(struct abstract_mutex* mutex); //!< unlock this mutex
+    void (*free)(struct abstract_mutex* mutex);   //!< destroy this mutex
 } mutex_t;
 
-
 /*!
  * @brief Init a abstract mutex.
  *
@@ -46,7 +44,6 @@ typedef struct abstract_mutex
  */
 void init_mutex(mutex_t* mutex);
 
-
 /*!
  * @brief Init a abstract mutex.
  *
@@ -54,7 +51,6 @@ void init_mutex(mutex_t* mutex);
  */
 void lock_mutex(mutex_t* mutex);
 
-
 /*!
  * @brief Init a abstract mutex.
  *
@@ -62,7 +58,6 @@ void lock_mutex(mutex_t* mutex);
  */
 void unlock_mutex(mutex_t* mutex);
 
-
 /*!
  * @brief Init a abstract mutex.
  *
diff --git a/source/utility/log.c b/source/utility/log.c
index 317a2cd14..1382d42cc 100644
--- a/source/utility/log.c
+++ b/source/utility/log.c
@@ -29,7 +29,6 @@
 #include "api/c_api.h"
 #include "utility/lock.h"
 
-
 #include <stdio.h>
 #include <time.h>
 #include <stdarg.h>
@@ -38,11 +37,9 @@
 #include <android/log.h>
 #endif
 
-
 static mutex_t log_locker;
 static const char* map_table[] = {"EMERG", "ALERT", "CRIT", "ERROR", "WARN", "NOTICE", "INFO", "DEBUG"};
 
-
 static void safety_log(struct logger* logger, char* message)
 {
     if (0 != message[TE_MAX_LOG_LENGTH - 1])
@@ -55,7 +52,6 @@ static void safety_log(struct logger* logger, char* message)
     unlock_mutex(&log_locker);
 }
 
-
 static void do_log(struct logger* logger, enum log_level level, const char* fmt, ...)
 {
     if (logger->log_level < level || level > LOG_DEBUG)
@@ -68,47 +64,47 @@ static void do_log(struct logger* logger, enum log_level level, const char* fmt,
 
     switch (level)
     {
-        case LOG_EMERG:
-        case LOG_ALERT:
-        case LOG_CRIT:
-        {
-            __android_log_print(ANDROID_LOG_FATAL, "Tengine", fmt, _ap);
-            break;
-        }
-        case LOG_ERR:
-        {
-            __android_log_print(ANDROID_LOG_ERROR, "Tengine", fmt, _ap);
-            break;
-        }
-        case LOG_WARNING:
-        {
-            __android_log_print(ANDROID_LOG_WARN, "Tengine", fmt, _ap);
-            break;
-        }
-        case LOG_NOTICE:
-        case LOG_INFO:
-        {
-            __android_log_print(ANDROID_LOG_INFO, "Tengine", fmt, _ap);
-            break;
-        }
-        case LOG_DEBUG:
-        {
-            __android_log_print(ANDROID_LOG_DEBUG, "Tengine", fmt, _ap);
-            break;
-        }
-        default:
-        {
-            __android_log_print(ANDROID_LOG_VERBOSE, "Tengine", fmt, _ap);
-        }
+    case LOG_EMERG:
+    case LOG_ALERT:
+    case LOG_CRIT:
+    {
+        __android_log_print(ANDROID_LOG_FATAL, "Tengine", fmt, _ap);
+        break;
+    }
+    case LOG_ERR:
+    {
+        __android_log_print(ANDROID_LOG_ERROR, "Tengine", fmt, _ap);
+        break;
+    }
+    case LOG_WARNING:
+    {
+        __android_log_print(ANDROID_LOG_WARN, "Tengine", fmt, _ap);
+        break;
+    }
+    case LOG_NOTICE:
+    case LOG_INFO:
+    {
+        __android_log_print(ANDROID_LOG_INFO, "Tengine", fmt, _ap);
+        break;
+    }
+    case LOG_DEBUG:
+    {
+        __android_log_print(ANDROID_LOG_DEBUG, "Tengine", fmt, _ap);
+        break;
+    }
+    default:
+    {
+        __android_log_print(ANDROID_LOG_VERBOSE, "Tengine", fmt, _ap);
+    }
     }
     va_end(_ap);
 
     return;
 #else
     va_list ap;
-    char msg[TE_MAX_LOG_LENGTH] = { 0 };
-    int  max_len = TE_MAX_LOG_LENGTH;
-    int  left = max_len;
+    char msg[TE_MAX_LOG_LENGTH] = {0};
+    int max_len = TE_MAX_LOG_LENGTH;
+    int left = max_len;
     char* p = msg;
     int ret;
 
@@ -157,7 +153,6 @@ static void do_log(struct logger* logger, enum log_level level, const char* fmt,
 #endif
 }
 
-
 static void change_log_level(struct logger* logger, int level)
 {
     if (level < 0 || level > LOG_DEBUG)
@@ -168,19 +163,16 @@ static void change_log_level(struct logger* logger, int level)
     logger->log_level = level;
 }
 
-
 static void set_output_func(struct logger* logger, void (*func)(const char*))
 {
     logger->output_func = func;
 }
 
-
 static void output_stderr(const char* msg)
 {
     fprintf(stderr, "%s", msg);
 }
 
-
 struct logger* get_default_logger(void)
 {
     static int inited = 0;
diff --git a/source/utility/log.h b/source/utility/log.h
index 7126950d2..993a5e4e8 100644
--- a/source/utility/log.h
+++ b/source/utility/log.h
@@ -27,7 +27,6 @@
 
 #include "api/c_api.h"
 
-
 struct log_option
 {
     int print_prefix;
@@ -35,7 +34,6 @@ struct log_option
     int print_level;
 };
 
-
 struct logger
 {
     const char* prefix;
@@ -49,68 +47,66 @@ struct logger
     void (*set_output_func)(struct logger*, void (*func)(const char*));
 };
 
-
 struct logger* get_default_logger(void);
 
-
-#define SET_LOG_OUTPUT(func)                              \
-    do                                                    \
-    {                                                     \
-        struct logger* logger = get_default_logger();     \
-        logger->set_output_func(logger, func);            \
-    } while(0)
-
-#define SET_LOG_LEVEL(level)                              \
-    do                                                    \
-    {                                                     \
-        struct logger* logger = get_default_logger();     \
-        logger->set_log_level(logger, level);             \
-    } while(0)
-
-#define SET_LOG_PRINT_TIME(val)                           \
-    do                                                    \
-    {                                                     \
-        struct logger* logger = get_default_logger();     \
-        logger->option.print_time = val;                  \
-    } while(0)
-
-#define SET_LOG_PRINT_LEVEL(val)                          \
-    do                                                    \
-    {                                                     \
-        struct logger* logger = get_default_logger();     \
-        logger->option.print_level = val;                 \
-    } while(0)
-
-#define SET_LOG_PRINT_PREFIX(val)                         \
-    do                                                    \
-    {                                                     \
-        struct logger* logger = get_default_logger();     \
-        logger->option.print_prefix = val;                \
-    } while(0)
-
-#define SET_LOG_PREFIX(prefix)                            \
-    do                                                    \
-    {                                                     \
-        struct logger* logger = get_default_logger();     \
-        logger->prefix = prefix;                          \
-    } while(0)
-
-#define LOG(level, fmt, ...)                              \
-    do                                                    \
-    {                                                     \
-        struct logger* logger = get_default_logger();     \
-        logger->log(logger, level, fmt, ##__VA_ARGS__);   \
-    } while(0)
-
-#define TLOG_EMERG(fmt, ...)    LOG(LOG_EMERG, fmt, ##__VA_ARGS__)
-#define TLOG_ALERT(fmt, ...)    LOG(LOG_ALERT, fmt, ##__VA_ARGS__)
-#define TLOG_CRIT(fmt, ...)     LOG(LOG_CRIT, fmt, ##__VA_ARGS__)
-#define TLOG_ERR(fmt, ...)      LOG(LOG_ERR, fmt, ##__VA_ARGS__)
-#define TLOG_WARNING(fmt, ...)  LOG(LOG_WARNING, fmt, ##__VA_ARGS__)
-#define TLOG_NOTICE(fmt, ...)   LOG(LOG_NOTICE, fmt, ##__VA_ARGS__)
-#define TLOG_INFO(fmt, ...)     LOG(LOG_INFO, fmt, ##__VA_ARGS__)
-#define TLOG_DEBUG(fmt, ...)    LOG(LOG_DEBUG, fmt, ##__VA_ARGS__)
-
-#define XLOG(level, fmt, ...)                             \
-    LOG(level, "%s:%d ", __FILE__, __LINE__);             \
+#define SET_LOG_OUTPUT(func)                          \
+    do                                                \
+    {                                                 \
+        struct logger* logger = get_default_logger(); \
+        logger->set_output_func(logger, func);        \
+    } while (0)
+
+#define SET_LOG_LEVEL(level)                          \
+    do                                                \
+    {                                                 \
+        struct logger* logger = get_default_logger(); \
+        logger->set_log_level(logger, level);         \
+    } while (0)
+
+#define SET_LOG_PRINT_TIME(val)                       \
+    do                                                \
+    {                                                 \
+        struct logger* logger = get_default_logger(); \
+        logger->option.print_time = val;              \
+    } while (0)
+
+#define SET_LOG_PRINT_LEVEL(val)                      \
+    do                                                \
+    {                                                 \
+        struct logger* logger = get_default_logger(); \
+        logger->option.print_level = val;             \
+    } while (0)
+
+#define SET_LOG_PRINT_PREFIX(val)                     \
+    do                                                \
+    {                                                 \
+        struct logger* logger = get_default_logger(); \
+        logger->option.print_prefix = val;            \
+    } while (0)
+
+#define SET_LOG_PREFIX(prefix)                        \
+    do                                                \
+    {                                                 \
+        struct logger* logger = get_default_logger(); \
+        logger->prefix = prefix;                      \
+    } while (0)
+
+#define LOG(level, fmt, ...)                            \
+    do                                                  \
+    {                                                   \
+        struct logger* logger = get_default_logger();   \
+        logger->log(logger, level, fmt, ##__VA_ARGS__); \
+    } while (0)
+
+#define TLOG_EMERG(fmt, ...)   LOG(LOG_EMERG, fmt, ##__VA_ARGS__)
+#define TLOG_ALERT(fmt, ...)   LOG(LOG_ALERT, fmt, ##__VA_ARGS__)
+#define TLOG_CRIT(fmt, ...)    LOG(LOG_CRIT, fmt, ##__VA_ARGS__)
+#define TLOG_ERR(fmt, ...)     LOG(LOG_ERR, fmt, ##__VA_ARGS__)
+#define TLOG_WARNING(fmt, ...) LOG(LOG_WARNING, fmt, ##__VA_ARGS__)
+#define TLOG_NOTICE(fmt, ...)  LOG(LOG_NOTICE, fmt, ##__VA_ARGS__)
+#define TLOG_INFO(fmt, ...)    LOG(LOG_INFO, fmt, ##__VA_ARGS__)
+#define TLOG_DEBUG(fmt, ...)   LOG(LOG_DEBUG, fmt, ##__VA_ARGS__)
+
+#define XLOG(level, fmt, ...)                 \
+    LOG(level, "%s:%d ", __FILE__, __LINE__); \
     LOG(level, fmt, ##__VA_ARGS__)
diff --git a/source/utility/math.c b/source/utility/math.c
index 1d7ab8c9d..117d343fa 100644
--- a/source/utility/math.c
+++ b/source/utility/math.c
@@ -27,38 +27,32 @@
 
 #include <stdlib.h>
 
-
 int imin(int a, int b)
 {
     return a <= b ? a : b;
 }
 
-
 int imax(int a, int b)
 {
     return a >= b ? a : b;
 }
 
-
 int min_abs(int a, int b)
 {
     return imin(abs(a), abs(b));
 }
 
-
 int max_abs(int a, int b)
 {
     return imax(abs(a), abs(b));
 }
 
-
 static int solve_gcd(int large, int small)
 {
     int val = large % small;
     return 0 == val ? small : gcd(small, val);
 }
 
-
 int gcd(int a, int b)
 {
     if (0 == a || 0 == b)
@@ -67,7 +61,6 @@ int gcd(int a, int b)
     return solve_gcd(max_abs(a, b), min_abs(a, b));
 }
 
-
 int lcm(int a, int b)
 {
     if (0 == a || 0 == b)
@@ -76,14 +69,12 @@ int lcm(int a, int b)
     return abs(a * b) / solve_gcd(max_abs(a, b), min_abs(a, b));
 }
 
-
 int align(int value, int step)
 {
     const int mask = ~(abs(step) - 1);
     return (value + step) & mask;
 }
 
-
 void* align_address(void* address, int step)
 {
     const size_t mask = ~(abs(step) - 1);
diff --git a/source/utility/math.h b/source/utility/math.h
index 672ddcdc1..16a7c5d9d 100644
--- a/source/utility/math.h
+++ b/source/utility/math.h
@@ -25,7 +25,6 @@
 
 #pragma once
 
-
 /*!
  * @brief  Solve min value
  *
@@ -36,7 +35,6 @@
  */
 int imin(int a, int b);
 
-
 /*!
  * @brief  Solve max value
  *
@@ -47,7 +45,6 @@ int imin(int a, int b);
  */
 int imax(int a, int b);
 
-
 /*!
  * @brief  Solve min absolute value
  *
@@ -58,7 +55,6 @@ int imax(int a, int b);
  */
 int min_abs(int a, int b);
 
-
 /*!
  * @brief  Solve max absolute value
  *
@@ -69,7 +65,6 @@ int min_abs(int a, int b);
  */
 int max_abs(int a, int b);
 
-
 /*!
  * @brief  Solve greatest common divisor
  *
@@ -80,7 +75,6 @@ int max_abs(int a, int b);
  */
 int gcd(int a, int b);
 
-
 /*!
  * @brief  Solve lowest common multiple
  *
@@ -91,7 +85,6 @@ int gcd(int a, int b);
  */
 int lcm(int a, int b);
 
-
 /*!
  * @brief  Solve min aligned value with the step length
  *
@@ -102,7 +95,6 @@ int lcm(int a, int b);
  */
 int align(int value, int step);
 
-
 /*!
  * @brief  Get aligned pointer
  *
diff --git a/source/utility/mem_stat.c b/source/utility/mem_stat.c
index c1106bba6..44e2212f5 100644
--- a/source/utility/mem_stat.c
+++ b/source/utility/mem_stat.c
@@ -69,7 +69,7 @@ static int find_block_list(void* ptr)
 
     for (i = 0; i < n; i++)
     {
-        struct block_stat* block_stat = ( struct block_stat* )get_vector_data(block_list, i);
+        struct block_stat* block_stat = (struct block_stat*)get_vector_data(block_list, i);
 
         if (block_stat->ptr == ptr)
             break;
@@ -178,7 +178,7 @@ void stat_free(void* ptr)
         return;
     }
 
-    struct block_stat* block_stat = ( struct block_stat* )get_vector_data(block_list, idx);
+    struct block_stat* block_stat = (struct block_stat*)get_vector_data(block_list, idx);
 
     mem_stat.free_count++;
     mem_stat.cur_mem_size -= block_stat->size;
@@ -204,7 +204,7 @@ void* stat_realloc(void* ptr, size_t size)
 
     void* new_ptr = realloc(ptr, size);
 
-    struct block_stat* block_stat = ( struct block_stat* )get_vector_data(block_list, idx);
+    struct block_stat* block_stat = (struct block_stat*)get_vector_data(block_list, idx);
 
     if (new_ptr == NULL)
     {
diff --git a/source/utility/sys_port.c b/source/utility/sys_port.c
index 783009568..a2887a929 100644
--- a/source/utility/sys_port.c
+++ b/source/utility/sys_port.c
@@ -82,7 +82,7 @@ char* strdup(const char* src)
 
     int n = strlen(src);
 
-    char* new_str = ( char* )sys_malloc(n + 1);
+    char* new_str = (char*)sys_malloc(n + 1);
 
     if (new_str == NULL)
         return NULL;
diff --git a/source/utility/sys_port.h b/source/utility/sys_port.h
index 83f4a5f72..151043663 100644
--- a/source/utility/sys_port.h
+++ b/source/utility/sys_port.h
@@ -52,8 +52,8 @@ void* sys_realloc(void* ptr, size_t size);
 
 #ifdef CONFIG_INTERN_ALLOCATOR
 
-#define malloc buddy_malloc
-#define free buddy_free
+#define malloc  buddy_malloc
+#define free    buddy_free
 #define realloc buddy_realloc
 
 void* buddy_malloc(size_t size);
diff --git a/source/utility/utils.c b/source/utility/utils.c
index d3e86838b..8079d1336 100644
--- a/source/utility/utils.c
+++ b/source/utility/utils.c
@@ -23,7 +23,6 @@
  * Revised: lswang@openailab.com
  */
 
-
 #include "utility/utils.h"
 
 #include "defines.h"
@@ -35,25 +34,23 @@
 #include <stdio.h>
 #include <string.h>
 
-
 const char* get_tensor_type_string(int tensor_type)
 {
     switch (tensor_type)
     {
-        case TENSOR_TYPE_VAR:
-            return "var";
-        case TENSOR_TYPE_CONST:
-            return "const";
-        case TENSOR_TYPE_INPUT:
-            return "input";
-        case TENSOR_TYPE_DEP:
-            return "dep";
-        default:
-            return "unknown";
+    case TENSOR_TYPE_VAR:
+        return "var";
+    case TENSOR_TYPE_CONST:
+        return "const";
+    case TENSOR_TYPE_INPUT:
+        return "input";
+    case TENSOR_TYPE_DEP:
+        return "dep";
+    default:
+        return "unknown";
     }
 }
 
-
 const char* get_tensor_layout_string(int layout)
 {
     if (layout == TENGINE_LAYOUT_NHWC)
@@ -62,31 +59,29 @@ const char* get_tensor_layout_string(int layout)
         return "NCHW";
 }
 
-
 const char* get_model_format_string(int model_format)
 {
     switch (model_format)
     {
-        case MODEL_FORMAT_TENGINE:
-            return "tengine";
-        case MODEL_FORMAT_CAFFE:
-            return "caffe";
-        case MODEL_FORMAT_ONNX:
-            return "onnx";
-        case MODEL_FORMAT_MXNET:
-            return "mxnet";
-        case MODEL_FORMAT_TENSORFLOW:
-            return "tensorflow";
-        case MODEL_FORMAT_TFLITE:
-            return "tflite";
-        case MODEL_FORMAT_DLA:
-            return "dla";
-        default:
-            return "unknown";
+    case MODEL_FORMAT_TENGINE:
+        return "tengine";
+    case MODEL_FORMAT_CAFFE:
+        return "caffe";
+    case MODEL_FORMAT_ONNX:
+        return "onnx";
+    case MODEL_FORMAT_MXNET:
+        return "mxnet";
+    case MODEL_FORMAT_TENSORFLOW:
+        return "tensorflow";
+    case MODEL_FORMAT_TFLITE:
+        return "tflite";
+    case MODEL_FORMAT_DLA:
+        return "dla";
+    default:
+        return "unknown";
     }
 }
 
-
 int get_op_type_from_name(const char* name)
 {
     int count = get_op_method_count();
@@ -104,68 +99,63 @@ int get_op_type_from_name(const char* name)
     return -1;
 }
 
-
 const char* get_op_name_from_type(int op_type)
 {
     return find_op_name(op_type);
 }
 
-
 int get_tenser_element_size(int data_type)
 {
     switch (data_type)
     {
-        case TENGINE_DT_FP32:
-        case TENGINE_DT_INT32:
-            return 4;
-        case TENGINE_DT_FP16:
-        case TENGINE_DT_INT16:
-            return 2;
-        case TENGINE_DT_INT8:
-        case TENGINE_DT_UINT8:
-            return 1;
-        default:
-            return 0;
+    case TENGINE_DT_FP32:
+    case TENGINE_DT_INT32:
+        return 4;
+    case TENGINE_DT_FP16:
+    case TENGINE_DT_INT16:
+        return 2;
+    case TENGINE_DT_INT8:
+    case TENGINE_DT_UINT8:
+        return 1;
+    default:
+        return 0;
     }
 }
 
-
 const char* get_tensor_data_type_string(int data_type)
 {
     switch (data_type)
     {
-        case TENGINE_DT_FP32:
-            return "fp32";
-        case TENGINE_DT_FP16:
-            return "fp16";
-        case TENGINE_DT_INT8:
-            return "int8";
-        case TENGINE_DT_UINT8:
-            return "uint8";
-        case TENGINE_DT_INT32:
-            return "int32";
-        case TENGINE_DT_INT16:
-            return "int16";
-        default:
-            return "unknown";
+    case TENGINE_DT_FP32:
+        return "fp32";
+    case TENGINE_DT_FP16:
+        return "fp16";
+    case TENGINE_DT_INT8:
+        return "int8";
+    case TENGINE_DT_UINT8:
+        return "uint8";
+    case TENGINE_DT_INT32:
+        return "int32";
+    case TENGINE_DT_INT16:
+        return "int16";
+    default:
+        return "unknown";
     }
 }
 
-
 const char* data_type_typeinfo_name(int data_type)
 {
     switch (data_type)
     {
-        case TENGINE_DT_INT32:
-            return "i";
-        case TENGINE_DT_FP32:
-            return "f";
-        default:
-            return NULL;
+    case TENGINE_DT_INT32:
+        return "i";
+    case TENGINE_DT_FP32:
+        return "f";
+    default:
+        return NULL;
     }
 }
 
-
 void dump_float(const char* file_name, float* data, int number)
 {
     FILE* fp = fopen(file_name, "w");
@@ -184,7 +174,6 @@ void dump_float(const char* file_name, float* data, int number)
     fclose(fp);
 }
 
-
 int get_mask_count(size_t mask)
 {
     int count = 0;
@@ -196,7 +185,6 @@ int get_mask_count(size_t mask)
     return count;
 }
 
-
 int get_mask_index(size_t mask)
 {
     if (get_mask_count(mask) > 1)
diff --git a/source/utility/utils.h b/source/utility/utils.h
index c52ccfdbd..59ad75aff 100644
--- a/source/utility/utils.h
+++ b/source/utility/utils.h
@@ -27,7 +27,6 @@
 
 #include <stddef.h>
 
-
 /*!
  * @brief Convert tensor type to char array.
  *
@@ -37,7 +36,6 @@
  */
 const char* get_tensor_type_string(int tensor_type);
 
-
 /*!
  * @brief Convert tensor layout to char array.
  *
@@ -47,7 +45,6 @@ const char* get_tensor_type_string(int tensor_type);
  */
 const char* get_tensor_layout_string(int tensor_layout);
 
-
 /*!
  * @brief Convert model format to char array.
  *
@@ -57,7 +54,6 @@ const char* get_tensor_layout_string(int tensor_layout);
  */
 const char* get_model_format_string(int model_format);
 
-
 /*!
  * @brief Convert operator name char array to enumeration value.
  *
@@ -67,7 +63,6 @@ const char* get_model_format_string(int model_format);
  */
 int get_op_type_from_name(const char* name);
 
-
 /*!
  * @brief Convert operator enumeration value to char array.
  *
@@ -77,7 +72,6 @@ int get_op_type_from_name(const char* name);
  */
 const char* get_op_name_from_type(int op_type);
 
-
 /*!
  * @brief Get single element size of the tensor data type.
  *
@@ -87,7 +81,6 @@ const char* get_op_name_from_type(int op_type);
  */
 int get_tenser_element_size(int data_type);
 
-
 /*!
  * @brief Convert tensor data type to char array.
  *
@@ -97,7 +90,6 @@ int get_tenser_element_size(int data_type);
  */
 const char* get_tensor_data_type_string(int data_type);
 
-
 /*!
  * @brief Convert tensor data type single letter char array.
  *
@@ -107,11 +99,8 @@ const char* get_tensor_data_type_string(int data_type);
  */
 const char* data_type_typeinfo_name(int data_type);
 
-
 void dump_float(const char* file_name, float* data, int number);
 
-
 int get_mask_count(size_t mask);
 
-
 int get_mask_index(size_t mask);
diff --git a/source/utility/vector.c b/source/utility/vector.c
index be8b1f01d..887d4d385 100644
--- a/source/utility/vector.c
+++ b/source/utility/vector.c
@@ -31,25 +31,22 @@
 
 #include <string.h>
 
-
 typedef struct vector_entry
 {
     int valid;
     unsigned char data[];
 } vector_entry_t;
 
-
 static inline vector_entry_t* get_vector_entry(vector_t* v, int idx)
 {
     return (vector_entry_t*)((char*)v->mem + v->entry_size * idx);
 }
 
-
 static inline void free_vector_data_resource(vector_t* v, int idx)
 {
     vector_entry_t* e = get_vector_entry(v, idx);
 
-    if(e->valid && v->free_func)
+    if (e->valid && v->free_func)
     {
         v->free_func(e->data);
     }
@@ -57,7 +54,6 @@ static inline void free_vector_data_resource(vector_t* v, int idx)
     e->valid = 0;
 }
 
-
 static inline void remove_vector_data_not_tail(vector_t* v, int idx)
 {
     vector_entry_t* entry_ptr = NULL;
@@ -78,7 +74,6 @@ static inline void remove_vector_data_not_tail(vector_t* v, int idx)
     entry_ptr->valid = 0;
 }
 
-
 vector_t* create_vector(int elem_size, void (*free_data)(void*))
 {
     vector_t* v = (vector_t*)sys_malloc(sizeof(vector_t));
@@ -109,7 +104,6 @@ vector_t* create_vector(int elem_size, void (*free_data)(void*))
     return v;
 }
 
-
 void release_vector(vector_t* v)
 {
     for (int i = 0; i < v->elem_num; i++)
@@ -121,7 +115,6 @@ void release_vector(vector_t* v)
     sys_free(v);
 }
 
-
 int get_vector_num(vector_t* v)
 {
     if (NULL != v)
@@ -132,7 +125,6 @@ int get_vector_num(vector_t* v)
     return 0;
 }
 
-
 int resize_vector(vector_t* v, int new_size)
 {
     void* new_mem = NULL;
@@ -162,7 +154,7 @@ int resize_vector(vector_t* v, int new_size)
     }
 
     v->real_mem = new_mem;
-    v->mem = ( void* )(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1)));
+    v->mem = (void*)(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1)));
 
     for (int i = v->space_num; i < new_size; i++)
     {
@@ -175,10 +167,9 @@ int resize_vector(vector_t* v, int new_size)
     return 0;
 }
 
-
 int push_vector_data(vector_t* v, void* data)
 {
-    if(v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0)
+    if (v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0)
     {
         return -1;
     }
@@ -189,12 +180,11 @@ int push_vector_data(vector_t* v, void* data)
     return 0;
 }
 
-
 int set_vector_data(vector_t* v, int idx, void* data)
 {
     vector_entry_t* e = NULL;
 
-    if(idx >= v->elem_num)
+    if (idx >= v->elem_num)
         return -1;
 
     free_vector_data_resource(v, idx);
@@ -207,10 +197,9 @@ int set_vector_data(vector_t* v, int idx, void* data)
     return 0;
 }
 
-
 void* get_vector_data(vector_t* v, int index)
 {
-    if(index >= v->elem_num)
+    if (index >= v->elem_num)
     {
         return NULL;
     }
@@ -220,7 +209,6 @@ void* get_vector_data(vector_t* v, int index)
     return e->data;
 }
 
-
 int remove_vector_via_pointer(vector_t* v, void* data)
 {
     const int count = v->elem_num;
@@ -245,11 +233,10 @@ int remove_vector_via_pointer(vector_t* v, void* data)
     return 0;
 }
 
-
 void remove_vector_via_index(vector_t* v, int idx)
 {
     // the last one
-    if(idx == v->elem_num - 1)
+    if (idx == v->elem_num - 1)
     {
         free_vector_data_resource(v, idx);
         v->elem_num--;
diff --git a/source/utility/vector.h b/source/utility/vector.h
index 73153b3f1..45ab70f03 100644
--- a/source/utility/vector.h
+++ b/source/utility/vector.h
@@ -35,18 +35,17 @@ extern "C" {
  */
 typedef struct vector
 {
-    int elem_size;                  //!< elements size which will be pushed into vector
-    int elem_num;                   //!< current counter of inserted elements
-
-    int entry_size;                 //!< size of inside vector header entry
-    int space_num;                  //!< the allocated elements counter, which should greater equal to 'elem_num'
-    int ahead_num;                  //!< allocated step when vector is full
-    void* real_mem;                 //!< real aligned memory address which point to vector entry
-    void* mem;                      //!< visual aligned address which point to the very begging of elements
-    void (*free_func)(void*);       //!< elements free function, will be called when release elements or vector
+    int elem_size; //!< elements size which will be pushed into vector
+    int elem_num;  //!< current counter of inserted elements
+
+    int entry_size;           //!< size of inside vector header entry
+    int space_num;            //!< the allocated elements counter, which should greater equal to 'elem_num'
+    int ahead_num;            //!< allocated step when vector is full
+    void* real_mem;           //!< real aligned memory address which point to vector entry
+    void* mem;                //!< visual aligned address which point to the very begging of elements
+    void (*free_func)(void*); //!< elements free function, will be called when release elements or vector
 } vector_t;
 
-
 /*!
  * @brief  Create a vector for a struct(or something else).
  *
@@ -59,7 +58,6 @@ typedef struct vector
  */
 vector_t* create_vector(int elem_size, void (*free_func)(void*));
 
-
 /*!
  * @brief  Release a vector.
  *
@@ -67,7 +65,6 @@ vector_t* create_vector(int elem_size, void (*free_func)(void*));
  */
 void release_vector(vector_t* v);
 
-
 /*!
  * @brief Get the count of elements.
  *
@@ -77,7 +74,6 @@ void release_vector(vector_t* v);
  */
 int get_vector_num(vector_t* v);
 
-
 /*!
  * @brief  Resize a vector.
  *
@@ -88,7 +84,6 @@ int get_vector_num(vector_t* v);
  */
 int resize_vector(vector_t* v, int new_size);
 
-
 /*!
  * @brief Push a element into vector from its pointer.
  *
@@ -99,7 +94,6 @@ int resize_vector(vector_t* v, int new_size);
  */
 int push_vector_data(vector_t* v, void* data);
 
-
 /*!
  * @brief Set a element via its index.
  *
@@ -111,7 +105,6 @@ int push_vector_data(vector_t* v, void* data);
  */
 int set_vector_data(vector_t* v, int index, void* data);
 
-
 /*!
  * @brief Get a element via its index.
  *
@@ -122,7 +115,6 @@ int set_vector_data(vector_t* v, int index, void* data);
  */
 void* get_vector_data(vector_t* v, int index);
 
-
 /*!
  * @brief Remove a element via its pointer.
  *
@@ -133,7 +125,6 @@ void* get_vector_data(vector_t* v, int index);
  */
 int remove_vector_via_pointer(vector_t* v, void* data);
 
-
 /*!
  * @brief Remove a element via its index.
  *
diff --git a/tests/common/common.h b/tests/common/common.h
index 40a263aba..9ab861855 100644
--- a/tests/common/common.h
+++ b/tests/common/common.h
@@ -42,9 +42,9 @@
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
-#else    // _WIN32
+#else // _WIN32
 #include <sys/time.h>
-#endif    // _WIN32
+#endif // _WIN32
 
 #ifdef _WIN32
 static double get_current_time()
@@ -56,7 +56,7 @@ static double get_current_time()
 
     return pc.QuadPart * 1000.0 / freq.QuadPart;
 }
-#else    // _WIN32
+#else  // _WIN32
 
 static double get_current_time()
 {
@@ -65,7 +65,7 @@ static double get_current_time()
 
     return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
 }
-#endif    // _WIN32
+#endif // _WIN32
 
 static void split(float* array, char* str, const char* del)
 {
@@ -78,4 +78,4 @@ static void split(float* array, char* str, const char* del)
     }
 }
 
-#endif    // __COMMON_H__
+#endif // __COMMON_H__
diff --git a/tests/common/compiler_fp16.h b/tests/common/compiler_fp16.h
index 1857d7eec..d770707c2 100644
--- a/tests/common/compiler_fp16.h
+++ b/tests/common/compiler_fp16.h
@@ -48,7 +48,7 @@ extern "C" {
 
 #else
 #ifdef _MSC_VER
-#pragma  pack (push,1)
+#pragma pack(push, 1)
 struct fp16_pack
 {
     unsigned short frac : 10;
@@ -84,12 +84,12 @@ typedef struct fp16_pack __fp16;
 static inline float fp16_to_fp32(__fp16 data)
 {
     float f;
-    struct fp32_pack* fp32 = ( struct fp32_pack* )&f;
+    struct fp32_pack* fp32 = (struct fp32_pack*)&f;
     struct fp16_pack* fp16 = &data;
 
     int exp = fp16->exp;
 
-    if(exp == 31 && fp16->frac != 0)
+    if (exp == 31 && fp16->frac != 0)
     {
         // return __builtin_inf()-__builtin_inf();
         fp32->sign = fp16->sign;
@@ -99,28 +99,28 @@ static inline float fp16_to_fp32(__fp16 data)
         return f;
     }
 
-    if(exp == 31)
+    if (exp == 31)
         exp = 255;
-    if(exp == 0)
+    if (exp == 0)
         exp = 0;
     else
         exp = (exp - 15) + 127;
 
     fp32->exp = exp;
     fp32->sign = fp16->sign;
-    fp32->frac = (( int )fp16->frac) << 13;
+    fp32->frac = ((int)fp16->frac) << 13;
 
     return f;
 }
 
 static inline __fp16 fp32_to_fp16(float data)
 {
-    struct fp32_pack* fp32 = ( struct fp32_pack* )&data;
+    struct fp32_pack* fp32 = (struct fp32_pack*)&data;
     struct fp16_pack fp16;
 
     int exp = fp32->exp;
 
-    if(fp32->exp == 255 && fp32->frac != 0)
+    if (fp32->exp == 255 && fp32->frac != 0)
     {
         // NaN
         fp16.exp = 31;
@@ -130,9 +130,9 @@ static inline __fp16 fp32_to_fp16(float data)
         return fp16;
     }
 
-    if((exp - 127) < -14)
+    if ((exp - 127) < -14)
         exp = 0;
-    else if((exp - 127) > 15)
+    else if ((exp - 127) > 15)
         exp = 31;
     else
         exp = exp - 127 + 15;
diff --git a/tests/common/stb_image.h b/tests/common/stb_image.h
index aa445aadf..142610cf4 100644
--- a/tests/common/stb_image.h
+++ b/tests/common/stb_image.h
@@ -3,13 +3,13 @@
 
 #ifndef STBI_NO_STDIO
 #include <stdio.h>
-#endif    // STBI_NO_STDIO
+#endif // STBI_NO_STDIO
 
 #define STBI_VERSION 1
 
 enum
 {
-    STBI_default = 0,    // only used for desired_channels
+    STBI_default = 0, // only used for desired_channels
 
     STBI_grey = 1,
     STBI_grey_alpha = 2,
@@ -36,9 +36,9 @@ extern "C" {
 typedef struct
 {
     int (*read)(void* user, char* data,
-                int size);    // fill 'data' with 'size' bytes.  return number of bytes actually read
-    void (*skip)(void* user, int n);    // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
-    int (*eof)(void* user);    // returns nonzero if we are at end of file/data
+                int size);           // fill 'data' with 'size' bytes.  return number of bytes actually read
+    void (*skip)(void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+    int (*eof)(void* user);          // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
 ////////////////////////////////////
@@ -95,12 +95,12 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* channels_in_fil
 #ifndef STBI_NO_HDR
 extern void stbi_hdr_to_ldr_gamma(float gamma);
 extern void stbi_hdr_to_ldr_scale(float scale);
-#endif    // STBI_NO_HDR
+#endif // STBI_NO_HDR
 
 #ifndef STBI_NO_LINEAR
 extern void stbi_ldr_to_hdr_gamma(float gamma);
 extern void stbi_ldr_to_hdr_scale(float scale);
-#endif    // STBI_NO_LINEAR
+#endif // STBI_NO_LINEAR
 
 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
 extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user);
@@ -108,7 +108,7 @@ extern int stbi_is_hdr_from_memory(stbi_uc const* buffer, int len);
 #ifndef STBI_NO_STDIO
 extern int stbi_is_hdr(char const* filename);
 extern int stbi_is_hdr_from_file(FILE* f);
-#endif    // STBI_NO_STDIO
+#endif // STBI_NO_STDIO
 
 // get a VERY brief reason for failure
 // NOT THREADSAFE
@@ -160,14 +160,12 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char*
 //
 //
 ////   end header file   /////////////////////////////////////////////////////
-#endif    // STBI_INCLUDE_STB_IMAGE_H
+#endif // STBI_INCLUDE_STB_IMAGE_H
 
 #define STB_IMAGE_IMPLEMENTATION
 #ifdef STB_IMAGE_IMPLEMENTATION
 
-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || \
-    defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) ||  \
-    defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
 #ifndef STBI_ONLY_JPEG
 #define STBI_NO_JPEG
 #endif
@@ -202,13 +200,13 @@ extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char*
 #endif
 
 #include <stdarg.h>
-#include <stddef.h>    // ptrdiff_t on osx
+#include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>    // ldexp, pow
+#include <math.h> // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -247,9 +245,9 @@ typedef int32_t stbi__int32;
 typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 
 #ifdef _MSC_VER
-#define STBI_NOTUSED(v) ( void )(v)
+#define STBI_NOTUSED(v) (void)(v)
 #else
-#define STBI_NOTUSED(v) ( void )sizeof(v)
+#define STBI_NOTUSED(v) (void)sizeof(v)
 #endif
 
 #ifdef _MSC_VER
@@ -271,9 +269,9 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 #endif
 
 #ifndef STBI_MALLOC
-#define STBI_MALLOC(sz) malloc(sz)
+#define STBI_MALLOC(sz)        malloc(sz)
 #define STBI_REALLOC(p, newsz) realloc(p, newsz)
-#define STBI_FREE(p) free(p)
+#define STBI_FREE(p)           free(p)
 #endif
 
 #ifndef STBI_REALLOC_SIZED
@@ -319,8 +317,8 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 
 #ifdef _MSC_VER
 
-#if _MSC_VER >= 1400    // not VC6
-#include <intrin.h>    // __cpuid
+#if _MSC_VER >= 1400 // not VC6
+#include <intrin.h>  // __cpuid
 static int stbi__cpuid3(void)
 {
     int info[4];
@@ -347,7 +345,7 @@ static int stbi__sse2_available(void)
     int info3 = stbi__cpuid3();
     return ((info3 >> 26) & 1) != 0;
 }
-#else    // assume GCC-style if not VC++
+#else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
 static int stbi__sse2_available(void)
@@ -404,8 +402,8 @@ static void stbi__start_mem(stbi__context* s, stbi_uc const* buffer, int len)
 {
     s->io.read = NULL;
     s->read_from_callbacks = 0;
-    s->img_buffer = s->img_buffer_original = ( stbi_uc* )buffer;
-    s->img_buffer_end = s->img_buffer_original_end = ( stbi_uc* )buffer + len;
+    s->img_buffer = s->img_buffer_original = (stbi_uc*)buffer;
+    s->img_buffer_end = s->img_buffer_original_end = (stbi_uc*)buffer + len;
 }
 
 // initialize a callback-based context
@@ -424,17 +422,17 @@ static void stbi__start_callbacks(stbi__context* s, stbi_io_callbacks* c, void*
 
 static int stbi__stdio_read(void* user, char* data, int size)
 {
-    return ( int )fread(data, 1, size, ( FILE* )user);
+    return (int)fread(data, 1, size, (FILE*)user);
 }
 
 static void stbi__stdio_skip(void* user, int n)
 {
-    fseek(( FILE* )user, n, SEEK_CUR);
+    fseek((FILE*)user, n, SEEK_CUR);
 }
 
 static int stbi__stdio_eof(void* user)
 {
-    return feof(( FILE* )user);
+    return feof((FILE*)user);
 }
 
 static stbi_io_callbacks stbi__stdio_callbacks = {
@@ -445,12 +443,12 @@ static stbi_io_callbacks stbi__stdio_callbacks = {
 
 static void stbi__start_file(stbi__context* s, FILE* f)
 {
-    stbi__start_callbacks(s, &stbi__stdio_callbacks, ( void* )f);
+    stbi__start_callbacks(s, &stbi__stdio_callbacks, (void*)f);
 }
 
 // static void stop_file(stbi__context *s) { }
 
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
 static void stbi__rewind(stbi__context* s)
 {
@@ -564,7 +562,7 @@ static void* stbi__malloc(size_t size)
 // negative terms are considered invalid.
 static int stbi__addsizes_valid(int a, int b)
 {
-    if(b < 0)
+    if (b < 0)
         return 0;
     // now 0 <= b <= INT_MAX, hence also
     // 0 <= INT_MAX - b <= INTMAX.
@@ -577,10 +575,10 @@ static int stbi__addsizes_valid(int a, int b)
 // negative factors are considered invalid.
 static int stbi__mul2sizes_valid(int a, int b)
 {
-    if(a < 0 || b < 0)
+    if (a < 0 || b < 0)
         return 0;
-    if(b == 0)
-        return 1;    // mul-by-0 is always safe
+    if (b == 0)
+        return 1; // mul-by-0 is always safe
     // portable way to check for no overflows in a*b
     return a <= INT_MAX / b;
 }
@@ -601,22 +599,21 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
 static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 {
-    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) &&
-           stbi__addsizes_valid(a * b * c * d, add);
+    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && stbi__addsizes_valid(a * b * c * d, add);
 }
 #endif
 
 // mallocs with size overflow checking
 static void* stbi__malloc_mad2(int a, int b, int add)
 {
-    if(!stbi__mad2sizes_valid(a, b, add))
+    if (!stbi__mad2sizes_valid(a, b, add))
         return NULL;
     return stbi__malloc(a * b + add);
 }
 
 static void* stbi__malloc_mad3(int a, int b, int c, int add)
 {
-    if(!stbi__mad3sizes_valid(a, b, c, add))
+    if (!stbi__mad3sizes_valid(a, b, c, add))
         return NULL;
     return stbi__malloc(a * b * c + add);
 }
@@ -624,7 +621,7 @@ static void* stbi__malloc_mad3(int a, int b, int c, int add)
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
 static void* stbi__malloc_mad4(int a, int b, int c, int d, int add)
 {
-    if(!stbi__mad4sizes_valid(a, b, c, d, add))
+    if (!stbi__mad4sizes_valid(a, b, c, d, add))
         return NULL;
     return stbi__malloc(a * b * c * d + add);
 }
@@ -642,8 +639,8 @@ static void* stbi__malloc_mad4(int a, int b, int c, int d, int add)
 #define stbi__err(x, y) stbi__err(x)
 #endif
 
-#define stbi__errpf(x, y) (( float* )(size_t)(stbi__err(x, y) ? NULL : NULL))
-#define stbi__errpuc(x, y) (( unsigned char* )(size_t)(stbi__err(x, y) ? NULL : NULL))
+#define stbi__errpf(x, y)  ((float*)(size_t)(stbi__err(x, y) ? NULL : NULL))
+#define stbi__errpuc(x, y) ((unsigned char*)(size_t)(stbi__err(x, y) ? NULL : NULL))
 
 extern void stbi_image_free(void* retval_from_stbi_load)
 {
@@ -667,43 +664,42 @@ extern void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
 
 static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri, int bpc)
 {
-    memset(ri, 0, sizeof(*ri));    // make sure it's initialized if we add new fields
-    ri->bits_per_channel = 8;    // default is 8 so most paths don't have to be changed
-    ri->channel_order =
-        STBI_ORDER_RGB;    // all current input & output are this, but this is here so we can add BGR order
+    memset(ri, 0, sizeof(*ri));         // make sure it's initialized if we add new fields
+    ri->bits_per_channel = 8;           // default is 8 so most paths don't have to be changed
+    ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
     ri->num_channels = 0;
 
 #ifndef STBI_NO_JPEG
-    if(stbi__jpeg_test(s))
+    if (stbi__jpeg_test(s))
         return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_PNG
-    if(stbi__png_test(s))
+    if (stbi__png_test(s))
         return stbi__png_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_BMP
-    if(stbi__bmp_test(s))
+    if (stbi__bmp_test(s))
         return stbi__bmp_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_GIF
-    if(stbi__gif_test(s))
+    if (stbi__gif_test(s))
         return stbi__gif_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_PSD
-    if(stbi__psd_test(s))
+    if (stbi__psd_test(s))
         return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc);
 #endif
 #ifndef STBI_NO_PIC
-    if(stbi__pic_test(s))
+    if (stbi__pic_test(s))
         return stbi__pic_load(s, x, y, comp, req_comp, ri);
 #endif
 #ifndef STBI_NO_PNM
-    if(stbi__pnm_test(s))
+    if (stbi__pnm_test(s))
         return stbi__pnm_load(s, x, y, comp, req_comp, ri);
 #endif
 
 #ifndef STBI_NO_HDR
-    if(stbi__hdr_test(s))
+    if (stbi__hdr_test(s))
     {
         float* hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri);
         return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
@@ -712,7 +708,7 @@ static void* stbi__load_main(stbi__context* s, int* x, int* y, int* comp, int re
 
 #ifndef STBI_NO_TGA
     // test tga last because it's a crappy test!
-    if(stbi__tga_test(s))
+    if (stbi__tga_test(s))
         return stbi__tga_load(s, x, y, comp, req_comp, ri);
 #endif
 
@@ -725,13 +721,12 @@ static stbi_uc* stbi__convert_16_to_8(stbi__uint16* orig, int w, int h, int chan
     int img_len = w * h * channels;
     stbi_uc* reduced;
 
-    reduced = ( stbi_uc* )stbi__malloc(img_len);
-    if(reduced == NULL)
+    reduced = (stbi_uc*)stbi__malloc(img_len);
+    if (reduced == NULL)
         return stbi__errpuc("outofmem", "Out of memory");
 
-    for(i = 0; i < img_len; ++i)
-        reduced[i] =
-            (stbi_uc)((orig[i] >> 8) & 0xFF);    // top half of each byte is sufficient approx of 16->8 bit scaling
+    for (i = 0; i < img_len; ++i)
+        reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
 
     STBI_FREE(orig);
     return reduced;
@@ -743,13 +738,12 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan
     int img_len = w * h * channels;
     stbi__uint16* enlarged;
 
-    enlarged = ( stbi__uint16* )stbi__malloc(img_len * 2);
-    if(enlarged == NULL)
-        return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory");
+    enlarged = (stbi__uint16*)stbi__malloc(img_len * 2);
+    if (enlarged == NULL)
+        return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory");
 
-    for(i = 0; i < img_len; ++i)
-        enlarged[i] =
-            (stbi__uint16)((orig[i] << 8) + orig[i]);    // replicate to high and low byte, maps 0->0, 255->0xffff
+    for (i = 0; i < img_len; ++i)
+        enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
 
     STBI_FREE(orig);
     return enlarged;
@@ -758,17 +752,17 @@ static stbi__uint16* stbi__convert_8_to_16(stbi_uc* orig, int w, int h, int chan
 static void stbi__vertical_flip(void* image, int w, int h, int bytes_per_pixel)
 {
     int row;
-    size_t bytes_per_row = ( size_t )w * bytes_per_pixel;
+    size_t bytes_per_row = (size_t)w * bytes_per_pixel;
     stbi_uc temp[2048];
-    stbi_uc* bytes = ( stbi_uc* )image;
+    stbi_uc* bytes = (stbi_uc*)image;
 
-    for(row = 0; row < (h >> 1); row++)
+    for (row = 0; row < (h >> 1); row++)
     {
         stbi_uc* row0 = bytes + row * bytes_per_row;
         stbi_uc* row1 = bytes + (h - row - 1) * bytes_per_row;
         // swap row0 with row1
         size_t bytes_left = bytes_per_row;
-        while(bytes_left)
+        while (bytes_left)
         {
             size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
             memcpy(temp, row0, bytes_copy);
@@ -786,8 +780,8 @@ static void stbi__vertical_flip_slices(void* image, int w, int h, int z, int byt
     int slice;
     int slice_size = w * h * bytes_per_pixel;
 
-    stbi_uc* bytes = ( stbi_uc* )image;
-    for(slice = 0; slice < z; ++slice)
+    stbi_uc* bytes = (stbi_uc*)image;
+    for (slice = 0; slice < z; ++slice)
     {
         stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
         bytes += slice_size;
@@ -799,25 +793,25 @@ static unsigned char* stbi__load_and_postprocess_8bit(stbi__context* s, int* x,
     stbi__result_info ri;
     void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
 
-    if(result == NULL)
+    if (result == NULL)
         return NULL;
 
-    if(ri.bits_per_channel != 8)
+    if (ri.bits_per_channel != 8)
     {
         STBI_ASSERT(ri.bits_per_channel == 16);
-        result = stbi__convert_16_to_8(( stbi__uint16* )result, *x, *y, req_comp == 0 ? *comp : req_comp);
+        result = stbi__convert_16_to_8((stbi__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp);
         ri.bits_per_channel = 8;
     }
 
     // @TODO: move stbi__convert_format to here
 
-    if(stbi__vertically_flip_on_load)
+    if (stbi__vertically_flip_on_load)
     {
         int channels = req_comp ? req_comp : *comp;
         stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
     }
 
-    return ( unsigned char* )result;
+    return (unsigned char*)result;
 }
 
 static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x, int* y, int* comp, int req_comp)
@@ -825,32 +819,32 @@ static stbi__uint16* stbi__load_and_postprocess_16bit(stbi__context* s, int* x,
     stbi__result_info ri;
     void* result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
 
-    if(result == NULL)
+    if (result == NULL)
         return NULL;
 
-    if(ri.bits_per_channel != 16)
+    if (ri.bits_per_channel != 16)
     {
         STBI_ASSERT(ri.bits_per_channel == 8);
-        result = stbi__convert_8_to_16(( stbi_uc* )result, *x, *y, req_comp == 0 ? *comp : req_comp);
+        result = stbi__convert_8_to_16((stbi_uc*)result, *x, *y, req_comp == 0 ? *comp : req_comp);
         ri.bits_per_channel = 16;
     }
 
     // @TODO: move stbi__convert_format16 to here
     // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
 
-    if(stbi__vertically_flip_on_load)
+    if (stbi__vertically_flip_on_load)
     {
         int channels = req_comp ? req_comp : *comp;
         stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
     }
 
-    return ( stbi__uint16* )result;
+    return (stbi__uint16*)result;
 }
 
 #if !defined(STBI_NO_HDR) || !defined(STBI_NO_LINEAR)
 static void stbi__float_postprocess(float* result, int* x, int* y, int* comp, int req_comp)
 {
-    if(stbi__vertically_flip_on_load && result != NULL)
+    if (stbi__vertically_flip_on_load && result != NULL)
     {
         int channels = req_comp ? req_comp : *comp;
         stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
@@ -864,7 +858,7 @@ static FILE* stbi__fopen(char const* filename, char const* mode)
 {
     FILE* f;
 #if defined(_MSC_VER) && _MSC_VER >= 1400
-    if(0 != fopen_s(&f, filename, mode))
+    if (0 != fopen_s(&f, filename, mode))
         f = 0;
 #else
     f = fopen(filename, mode);
@@ -876,7 +870,7 @@ extern stbi_uc* stbi_load(const char* filename, int* x, int* y, int* comp, int r
 {
     FILE* f = stbi__fopen(filename, "rb");
     unsigned char* result;
-    if(!f)
+    if (!f)
         return stbi__errpuc("can't fopen", "Unable to open file");
     result = stbi_load_from_file(f, x, y, comp, req_comp);
     fclose(f);
@@ -889,10 +883,10 @@ extern stbi_uc* stbi_load_from_file(FILE* f, int* x, int* y, int* comp, int req_
     stbi__context s;
     stbi__start_file(&s, f);
     result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
-    if(result)
+    if (result)
     {
         // need to 'unget' all the characters in the IO buffer
-        fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+        fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
     }
     return result;
 }
@@ -903,10 +897,10 @@ extern stbi__uint16* stbi_load_from_file_16(FILE* f, int* x, int* y, int* comp,
     stbi__context s;
     stbi__start_file(&s, f);
     result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp);
-    if(result)
+    if (result)
     {
         // need to 'unget' all the characters in the IO buffer
-        fseek(f, -( int )(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+        fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
     }
     return result;
 }
@@ -915,14 +909,14 @@ extern stbi_us* stbi_load_16(char const* filename, int* x, int* y, int* comp, in
 {
     FILE* f = stbi__fopen(filename, "rb");
     stbi__uint16* result;
-    if(!f)
-        return ( stbi_us* )stbi__errpuc("can't fopen", "Unable to open file");
+    if (!f)
+        return (stbi_us*)stbi__errpuc("can't fopen", "Unable to open file");
     result = stbi_load_from_file_16(f, x, y, comp, req_comp);
     fclose(f);
     return result;
 }
 
-#endif    //! STBI_NO_STDIO
+#endif //! STBI_NO_STDIO
 
 extern stbi_us* stbi_load_16_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* channels_in_file,
                                          int desired_channels)
@@ -936,7 +930,7 @@ extern stbi_us* stbi_load_16_from_callbacks(stbi_io_callbacks const* clbk, void*
                                             int* channels_in_file, int desired_channels)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels);
 }
 
@@ -951,7 +945,7 @@ extern stbi_uc* stbi_load_from_callbacks(stbi_io_callbacks const* clbk, void* us
                                          int req_comp)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
 }
 
@@ -963,8 +957,8 @@ extern stbi_uc* stbi_load_gif_from_memory(stbi_uc const* buffer, int len, int**
     stbi__context s;
     stbi__start_mem(&s, buffer, len);
 
-    result = ( unsigned char* )stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
-    if(stbi__vertically_flip_on_load)
+    result = (unsigned char*)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+    if (stbi__vertically_flip_on_load)
     {
         stbi__vertical_flip_slices(result, *x, *y, *z, *comp);
     }
@@ -978,17 +972,17 @@ static float* stbi__loadf_main(stbi__context* s, int* x, int* y, int* comp, int
 {
     unsigned char* data;
 #ifndef STBI_NO_HDR
-    if(stbi__hdr_test(s))
+    if (stbi__hdr_test(s))
     {
         stbi__result_info ri;
         float* hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri);
-        if(hdr_data)
+        if (hdr_data)
             stbi__float_postprocess(hdr_data, x, y, comp, req_comp);
         return hdr_data;
     }
 #endif
     data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
-    if(data)
+    if (data)
         return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
     return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
 }
@@ -1004,7 +998,7 @@ extern float* stbi_loadf_from_callbacks(stbi_io_callbacks const* clbk, void* use
                                         int req_comp)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__loadf_main(&s, x, y, comp, req_comp);
 }
 
@@ -1013,7 +1007,7 @@ extern float* stbi_loadf(char const* filename, int* x, int* y, int* comp, int re
 {
     float* result;
     FILE* f = stbi__fopen(filename, "rb");
-    if(!f)
+    if (!f)
         return stbi__errpf("can't fopen", "Unable to open file");
     result = stbi_loadf_from_file(f, x, y, comp, req_comp);
     fclose(f);
@@ -1026,9 +1020,9 @@ extern float* stbi_loadf_from_file(FILE* f, int* x, int* y, int* comp, int req_c
     stbi__start_file(&s, f);
     return stbi__loadf_main(&s, x, y, comp, req_comp);
 }
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
-#endif    // !STBI_NO_LINEAR
+#endif // !STBI_NO_LINEAR
 
 // these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is
 // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
@@ -1052,7 +1046,7 @@ extern int stbi_is_hdr(char const* filename)
 {
     FILE* f = stbi__fopen(filename, "rb");
     int result = 0;
-    if(f)
+    if (f)
     {
         result = stbi_is_hdr_from_file(f);
         fclose(f);
@@ -1075,13 +1069,13 @@ extern int stbi_is_hdr_from_file(FILE* f)
     return 0;
 #endif
 }
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
 extern int stbi_is_hdr_from_callbacks(stbi_io_callbacks const* clbk, void* user)
 {
 #ifndef STBI_NO_HDR
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )clbk, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)clbk, user);
     return stbi__hdr_test(&s);
 #else
     STBI_NOTUSED(clbk);
@@ -1128,8 +1122,8 @@ enum
 
 static void stbi__refill_buffer(stbi__context* s)
 {
-    int n = (s->io.read)(s->io_user_data, ( char* )s->buffer_start, s->buflen);
-    if(n == 0)
+    int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);
+    if (n == 0)
     {
         // at end of file, treat same as if from memory, but need to handle case
         // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
@@ -1147,9 +1141,9 @@ static void stbi__refill_buffer(stbi__context* s)
 
 stbi_inline static stbi_uc stbi__get8(stbi__context* s)
 {
-    if(s->img_buffer < s->img_buffer_end)
+    if (s->img_buffer < s->img_buffer_end)
         return *s->img_buffer++;
-    if(s->read_from_callbacks)
+    if (s->read_from_callbacks)
     {
         stbi__refill_buffer(s);
         return *s->img_buffer++;
@@ -1159,13 +1153,13 @@ stbi_inline static stbi_uc stbi__get8(stbi__context* s)
 
 stbi_inline static int stbi__at_eof(stbi__context* s)
 {
-    if(s->io.read)
+    if (s->io.read)
     {
-        if(!(s->io.eof)(s->io_user_data))
+        if (!(s->io.eof)(s->io_user_data))
             return 0;
         // if feof() is true, check if buffer = end
         // special case: we've only got the special 0 character at the end
-        if(s->read_from_callbacks == 0)
+        if (s->read_from_callbacks == 0)
             return 1;
     }
 
@@ -1174,15 +1168,15 @@ stbi_inline static int stbi__at_eof(stbi__context* s)
 
 static void stbi__skip(stbi__context* s, int n)
 {
-    if(n < 0)
+    if (n < 0)
     {
         s->img_buffer = s->img_buffer_end;
         return;
     }
-    if(s->io.read)
+    if (s->io.read)
     {
-        int blen = ( int )(s->img_buffer_end - s->img_buffer);
-        if(blen < n)
+        int blen = (int)(s->img_buffer_end - s->img_buffer);
+        if (blen < n)
         {
             s->img_buffer = s->img_buffer_end;
             (s->io.skip)(s->io_user_data, n - blen);
@@ -1194,23 +1188,23 @@ static void stbi__skip(stbi__context* s, int n)
 
 static int stbi__getn(stbi__context* s, stbi_uc* buffer, int n)
 {
-    if(s->io.read)
+    if (s->io.read)
     {
-        int blen = ( int )(s->img_buffer_end - s->img_buffer);
-        if(blen < n)
+        int blen = (int)(s->img_buffer_end - s->img_buffer);
+        if (blen < n)
         {
             int res, count;
 
             memcpy(buffer, s->img_buffer, blen);
 
-            count = (s->io.read)(s->io_user_data, ( char* )buffer + blen, n - blen);
+            count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen);
             res = (count == (n - blen));
             s->img_buffer = s->img_buffer_end;
             return res;
         }
     }
 
-    if(s->img_buffer + n <= s->img_buffer_end)
+    if (s->img_buffer + n <= s->img_buffer_end)
     {
         memcpy(buffer, s->img_buffer, n);
         s->img_buffer += n;
@@ -1250,7 +1244,7 @@ static stbi__uint32 stbi__get32le(stbi__context* s)
 }
 #endif
 
-#define STBI__BYTECAST(x) ((stbi_uc)(( x )&255))    // truncate int to byte without warnings
+#define STBI__BYTECAST(x) ((stbi_uc)((x)&255)) // truncate int to byte without warnings
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -1273,29 +1267,29 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r
     int i, j;
     unsigned char* good;
 
-    if(req_comp == img_n)
+    if (req_comp == img_n)
         return data;
     STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-    good = ( unsigned char* )stbi__malloc_mad3(req_comp, x, y, 0);
-    if(good == NULL)
+    good = (unsigned char*)stbi__malloc_mad3(req_comp, x, y, 0);
+    if (good == NULL)
     {
         STBI_FREE(data);
         return stbi__errpuc("outofmem", "Out of memory");
     }
 
-    for(j = 0; j < ( int )y; ++j)
+    for (j = 0; j < (int)y; ++j)
     {
         unsigned char* src = data + j * x * img_n;
         unsigned char* dest = good + j * x * req_comp;
 
-#define STBI__COMBO(a, b) (( a )*8 + (b))
+#define STBI__COMBO(a, b) ((a)*8 + (b))
 #define STBI__CASE(a, b)    \
     case STBI__COMBO(a, b): \
-        for(i = x - 1; i >= 0; --i, src += a, dest += b)
+        for (i = x - 1; i >= 0; --i, src += a, dest += b)
         // convert source image with img_n components to one with req_comp components;
         // avoid switch per pixel, so use switch per scanline and massive macros
-        switch(STBI__COMBO(img_n, req_comp))
+        switch (STBI__COMBO(img_n, req_comp))
         {
             STBI__CASE(1, 2)
             {
@@ -1357,8 +1351,8 @@ static unsigned char* stbi__convert_format(unsigned char* data, int img_n, int r
                 dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
             }
             break;
-            default:
-                STBI_ASSERT(0);
+        default:
+            STBI_ASSERT(0);
         }
 #undef STBI__CASE
     }
@@ -1377,29 +1371,29 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r
     int i, j;
     stbi__uint16* good;
 
-    if(req_comp == img_n)
+    if (req_comp == img_n)
         return data;
     STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-    good = ( stbi__uint16* )stbi__malloc((size_t)req_comp * x * y * 2);
-    if(good == NULL)
+    good = (stbi__uint16*)stbi__malloc((size_t)req_comp * x * y * 2);
+    if (good == NULL)
     {
         STBI_FREE(data);
-        return ( stbi__uint16* )stbi__errpuc("outofmem", "Out of memory");
+        return (stbi__uint16*)stbi__errpuc("outofmem", "Out of memory");
     }
 
-    for(j = 0; j < ( int )y; ++j)
+    for (j = 0; j < (int)y; ++j)
     {
         stbi__uint16* src = data + j * x * img_n;
         stbi__uint16* dest = good + j * x * req_comp;
 
-#define STBI__COMBO(a, b) (( a )*8 + (b))
+#define STBI__COMBO(a, b) ((a)*8 + (b))
 #define STBI__CASE(a, b)    \
     case STBI__COMBO(a, b): \
-        for(i = x - 1; i >= 0; --i, src += a, dest += b)
+        for (i = x - 1; i >= 0; --i, src += a, dest += b)
         // convert source image with img_n components to one with req_comp components;
         // avoid switch per pixel, so use switch per scanline and massive macros
-        switch(STBI__COMBO(img_n, req_comp))
+        switch (STBI__COMBO(img_n, req_comp))
         {
             STBI__CASE(1, 2)
             {
@@ -1461,8 +1455,8 @@ static stbi__uint16* stbi__convert_format16(stbi__uint16* data, int img_n, int r
                 dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
             }
             break;
-            default:
-                STBI_ASSERT(0);
+        default:
+            STBI_ASSERT(0);
         }
 #undef STBI__CASE
     }
@@ -1476,26 +1470,26 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp)
 {
     int i, k, n;
     float* output;
-    if(!data)
+    if (!data)
         return NULL;
-    output = ( float* )stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
-    if(output == NULL)
+    output = (float*)stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
+    if (output == NULL)
     {
         STBI_FREE(data);
         return stbi__errpf("outofmem", "Out of memory");
     }
     // compute number of non-alpha components
-    if(comp & 1)
+    if (comp & 1)
         n = comp;
     else
         n = comp - 1;
-    for(i = 0; i < x * y; ++i)
+    for (i = 0; i < x * y; ++i)
     {
-        for(k = 0; k < n; ++k)
+        for (k = 0; k < n; ++k)
         {
-            output[i * comp + k] = ( float )(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+            output[i * comp + k] = (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
         }
-        if(k < comp)
+        if (k < comp)
             output[i * comp + k] = data[i * comp + k] / 255.0f;
     }
     STBI_FREE(data);
@@ -1504,43 +1498,43 @@ static float* stbi__ldr_to_hdr(stbi_uc* data, int x, int y, int comp)
 #endif
 
 #ifndef STBI_NO_HDR
-#define stbi__float2int(x) (( int )(x))
+#define stbi__float2int(x) ((int)(x))
 static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp)
 {
     int i, k, n;
     stbi_uc* output;
-    if(!data)
+    if (!data)
         return NULL;
-    output = ( stbi_uc* )stbi__malloc_mad3(x, y, comp, 0);
-    if(output == NULL)
+    output = (stbi_uc*)stbi__malloc_mad3(x, y, comp, 0);
+    if (output == NULL)
     {
         STBI_FREE(data);
         return stbi__errpuc("outofmem", "Out of memory");
     }
     // compute number of non-alpha components
-    if(comp & 1)
+    if (comp & 1)
         n = comp;
     else
         n = comp - 1;
-    for(i = 0; i < x * y; ++i)
+    for (i = 0; i < x * y; ++i)
     {
-        for(k = 0; k < n; ++k)
+        for (k = 0; k < n; ++k)
         {
-            float z = ( float )pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
-            if(z < 0)
+            float z = (float)pow((double)data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+            if (z < 0)
                 z = 0;
-            if(z > 255)
+            if (z > 255)
                 z = 255;
-            output[i * comp + k] = ( stbi_uc )stbi__float2int(z);
+            output[i * comp + k] = (stbi_uc)stbi__float2int(z);
         }
-        if(k < comp)
+        if (k < comp)
         {
             float z = data[i * comp + k] * 255 + 0.5f;
-            if(z < 0)
+            if (z < 0)
                 z = 0;
-            if(z > 255)
+            if (z > 255)
                 z = 255;
-            output[i * comp + k] = ( stbi_uc )stbi__float2int(z);
+            output[i * comp + k] = (stbi_uc)stbi__float2int(z);
         }
     }
     STBI_FREE(data);
@@ -1572,7 +1566,7 @@ static stbi_uc* stbi__hdr_to_ldr(float* data, int x, int y, int comp)
 #ifndef STBI_NO_JPEG
 
 // huffman decoding acceleration
-#define FAST_BITS 9    // larger handles more cases; smaller stomps less cache
+#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache
 
 typedef struct
 {
@@ -1582,7 +1576,7 @@ typedef struct
     stbi_uc values[256];
     stbi_uc size[257];
     unsigned int maxcode[18];
-    int delta[17];    // old 'firstsymbol' - old 'firstcode'
+    int delta[17]; // old 'firstsymbol' - old 'firstcode'
 } stbi__huffman;
 
 typedef struct
@@ -1611,14 +1605,14 @@ typedef struct
         stbi_uc* data;
         void *raw_data, *raw_coeff;
         stbi_uc* linebuf;
-        short* coeff;    // progressive only
-        int coeff_w, coeff_h;    // number of 8x8 coefficient blocks
+        short* coeff;         // progressive only
+        int coeff_w, coeff_h; // number of 8x8 coefficient blocks
     } img_comp[4];
 
-    stbi__uint32 code_buffer;    // jpeg entropy-coded buffer
-    int code_bits;    // number of valid bits
-    unsigned char marker;    // marker seen while filling entropy buffer
-    int nomore;    // flag if we saw a marker so must stop
+    stbi__uint32 code_buffer; // jpeg entropy-coded buffer
+    int code_bits;            // number of valid bits
+    unsigned char marker;     // marker seen while filling entropy buffer
+    int nomore;               // flag if we saw a marker so must stop
 
     int progressive;
     int spec_start;
@@ -1627,7 +1621,7 @@ typedef struct
     int succ_low;
     int eob_run;
     int jfif;
-    int app14_color_transform;    // Adobe APP14 tag
+    int app14_color_transform; // Adobe APP14 tag
     int rgb;
 
     int scan_n, order[4];
@@ -1645,23 +1639,23 @@ static int stbi__build_huffman(stbi__huffman* h, int* count)
     int i, j, k = 0;
     unsigned int code;
     // build size list for each symbol (from JPEG spec)
-    for(i = 0; i < 16; ++i)
-        for(j = 0; j < count[i]; ++j)
+    for (i = 0; i < 16; ++i)
+        for (j = 0; j < count[i]; ++j)
             h->size[k++] = (stbi_uc)(i + 1);
     h->size[k] = 0;
 
     // compute actual symbols (from jpeg spec)
     code = 0;
     k = 0;
-    for(j = 1; j <= 16; ++j)
+    for (j = 1; j <= 16; ++j)
     {
         // compute delta to add to code to compute symbol id
         h->delta[j] = k - code;
-        if(h->size[k] == j)
+        if (h->size[k] == j)
         {
-            while(h->size[k] == j)
+            while (h->size[k] == j)
                 h->code[k++] = (stbi__uint16)(code++);
-            if(code - 1 >= (1u << j))
+            if (code - 1 >= (1u << j))
                 return stbi__err("bad code lengths", "Corrupt JPEG");
         }
         // compute largest code + 1 for this size, preshifted as needed later
@@ -1672,16 +1666,16 @@ static int stbi__build_huffman(stbi__huffman* h, int* count)
 
     // build non-spec acceleration table; 255 is flag for not-accelerated
     memset(h->fast, 255, 1 << FAST_BITS);
-    for(i = 0; i < k; ++i)
+    for (i = 0; i < k; ++i)
     {
         int s = h->size[i];
-        if(s <= FAST_BITS)
+        if (s <= FAST_BITS)
         {
             int c = h->code[i] << (FAST_BITS - s);
             int m = 1 << (FAST_BITS - s);
-            for(j = 0; j < m; ++j)
+            for (j = 0; j < m; ++j)
             {
-                h->fast[c + j] = ( stbi_uc )i;
+                h->fast[c + j] = (stbi_uc)i;
             }
         }
     }
@@ -1693,26 +1687,26 @@ static int stbi__build_huffman(stbi__huffman* h, int* count)
 static void stbi__build_fast_ac(stbi__int16* fast_ac, stbi__huffman* h)
 {
     int i;
-    for(i = 0; i < (1 << FAST_BITS); ++i)
+    for (i = 0; i < (1 << FAST_BITS); ++i)
     {
         stbi_uc fast = h->fast[i];
         fast_ac[i] = 0;
-        if(fast < 255)
+        if (fast < 255)
         {
             int rs = h->values[fast];
             int run = (rs >> 4) & 15;
             int magbits = rs & 15;
             int len = h->size[fast];
 
-            if(magbits && len + magbits <= FAST_BITS)
+            if (magbits && len + magbits <= FAST_BITS)
             {
                 // magnitude code followed by receive_extend code
                 int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
                 int m = 1 << (magbits - 1);
-                if(k < m)
+                if (k < m)
                     k += (~0U << magbits) + 1;
                 // if the result is small enough, we can fit it in fast_ac table
-                if(k >= -128 && k <= 127)
+                if (k >= -128 && k <= 127)
                     fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits));
             }
         }
@@ -1724,25 +1718,25 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg* j)
     do
     {
         unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
-        if(b == 0xff)
+        if (b == 0xff)
         {
             int c = stbi__get8(j->s);
-            while(c == 0xff)
-                c = stbi__get8(j->s);    // consume fill bytes
-            if(c != 0)
+            while (c == 0xff)
+                c = stbi__get8(j->s); // consume fill bytes
+            if (c != 0)
             {
-                j->marker = ( unsigned char )c;
+                j->marker = (unsigned char)c;
                 j->nomore = 1;
                 return;
             }
         }
         j->code_buffer |= b << (24 - j->code_bits);
         j->code_bits += 8;
-    } while(j->code_bits <= 24);
+    } while (j->code_bits <= 24);
 }
 
 // (1 << n) - 1
-static const stbi__uint32 stbi__bmask[17] = {0,   1,    3,    7,    15,   31,    63,    127,  255,
+static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255,
                                              511, 1023, 2047, 4095, 8191, 16383, 32767, 65535};
 
 // decode a jpeg huffman value from the bitstream
@@ -1751,17 +1745,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h)
     unsigned int temp;
     int c, k;
 
-    if(j->code_bits < 16)
+    if (j->code_bits < 16)
         stbi__grow_buffer_unsafe(j);
 
     // look at the top FAST_BITS and determine what symbol ID it is,
     // if the code is <= FAST_BITS
     c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
     k = h->fast[c];
-    if(k < 255)
+    if (k < 255)
     {
         int s = h->size[k];
-        if(s > j->code_bits)
+        if (s > j->code_bits)
             return -1;
         j->code_buffer <<= s;
         j->code_bits -= s;
@@ -1775,17 +1769,17 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h)
     // wants to be compared against something shifted to have 16;
     // that way we don't need to shift inside the loop.
     temp = j->code_buffer >> 16;
-    for(k = FAST_BITS + 1;; ++k)
-        if(temp < h->maxcode[k])
+    for (k = FAST_BITS + 1;; ++k)
+        if (temp < h->maxcode[k])
             break;
-    if(k == 17)
+    if (k == 17)
     {
         // error! code not found
         j->code_bits -= 16;
         return -1;
     }
 
-    if(k > j->code_bits)
+    if (k > j->code_bits)
         return -1;
 
     // convert the huffman code to the symbol id
@@ -1799,7 +1793,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg* j, stbi__huffman* h)
 }
 
 // bias[n] = (-1<<n) + 1
-static const int stbi__jbias[16] = {0,    -1,   -3,    -7,    -15,   -31,   -63,    -127,
+static const int stbi__jbias[16] = {0, -1, -3, -7, -15, -31, -63, -127,
                                     -255, -511, -1023, -2047, -4095, -8191, -16383, -32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
@@ -1808,12 +1802,12 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg* j, int n)
 {
     unsigned int k;
     int sgn;
-    if(j->code_bits < n)
+    if (j->code_bits < n)
         stbi__grow_buffer_unsafe(j);
 
-    sgn = ( stbi__int32 )j->code_buffer >> 31;    // sign bit is always in MSB
+    sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
     k = stbi_lrot(j->code_buffer, n);
-    STBI_ASSERT(n >= 0 && n < ( int )(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
+    STBI_ASSERT(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
     j->code_buffer = k & ~stbi__bmask[n];
     k &= stbi__bmask[n];
     j->code_bits -= n;
@@ -1824,7 +1818,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg* j, int n)
 stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n)
 {
     unsigned int k;
-    if(j->code_bits < n)
+    if (j->code_bits < n)
         stbi__grow_buffer_unsafe(j);
     k = stbi_lrot(j->code_buffer, n);
     j->code_buffer = k & ~stbi__bmask[n];
@@ -1836,7 +1830,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg* j, int n)
 stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg* j)
 {
     unsigned int k;
-    if(j->code_bits < 1)
+    if (j->code_bits < 1)
         stbi__grow_buffer_unsafe(j);
     k = j->code_buffer;
     j->code_buffer <<= 1;
@@ -1860,10 +1854,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
     int diff, dc, k;
     int t;
 
-    if(j->code_bits < 16)
+    if (j->code_bits < 16)
         stbi__grow_buffer_unsafe(j);
     t = stbi__jpeg_huff_decode(j, hdc);
-    if(t < 0)
+    if (t < 0)
         return stbi__err("bad huffman code", "Corrupt JPEG");
 
     // 0 all the ac values now so we can do it 32-bits at a time
@@ -1872,7 +1866,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
     diff = t ? stbi__extend_receive(j, t) : 0;
     dc = j->img_comp[b].dc_pred + diff;
     j->img_comp[b].dc_pred = dc;
-    data[0] = ( short )(dc * dequant[0]);
+    data[0] = (short)(dc * dequant[0]);
 
     // decode AC components, see JPEG spec
     k = 1;
@@ -1880,31 +1874,31 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
     {
         unsigned int zig;
         int c, r, s;
-        if(j->code_bits < 16)
+        if (j->code_bits < 16)
             stbi__grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
         r = fac[c];
-        if(r)
-        {    // fast-AC path
-            k += (r >> 4) & 15;    // run
-            s = r & 15;    // combined length
+        if (r)
+        {                       // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15;         // combined length
             j->code_buffer <<= s;
             j->code_bits -= s;
             // decode into unzigzag'd location
             zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = ( short )((r >> 8) * dequant[zig]);
+            data[zig] = (short)((r >> 8) * dequant[zig]);
         }
         else
         {
             int rs = stbi__jpeg_huff_decode(j, hac);
-            if(rs < 0)
+            if (rs < 0)
                 return stbi__err("bad huffman code", "Corrupt JPEG");
             s = rs & 15;
             r = rs >> 4;
-            if(s == 0)
+            if (s == 0)
             {
-                if(rs != 0xf0)
-                    break;    // end block
+                if (rs != 0xf0)
+                    break; // end block
                 k += 16;
             }
             else
@@ -1912,10 +1906,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg* j, short data[64], stbi__huffman*
                 k += r;
                 // decode into unzigzag'd location
                 zig = stbi__jpeg_dezigzag[k++];
-                data[zig] = ( short )(stbi__extend_receive(j, s) * dequant[zig]);
+                data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]);
             }
         }
-    } while(k < 64);
+    } while (k < 64);
     return 1;
 }
 
@@ -1923,28 +1917,28 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__
 {
     int diff, dc;
     int t;
-    if(j->spec_end != 0)
+    if (j->spec_end != 0)
         return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 
-    if(j->code_bits < 16)
+    if (j->code_bits < 16)
         stbi__grow_buffer_unsafe(j);
 
-    if(j->succ_high == 0)
+    if (j->succ_high == 0)
     {
         // first scan for DC coefficient, must be first
-        memset(data, 0, 64 * sizeof(data[0]));    // 0 all the ac values now
+        memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
         t = stbi__jpeg_huff_decode(j, hdc);
         diff = t ? stbi__extend_receive(j, t) : 0;
 
         dc = j->img_comp[b].dc_pred + diff;
         j->img_comp[b].dc_pred = dc;
-        data[0] = ( short )(dc << j->succ_low);
+        data[0] = (short)(dc << j->succ_low);
     }
     else
     {
         // refinement scan for DC coefficient
-        if(stbi__jpeg_get_bit(j))
-            data[0] += ( short )(1 << j->succ_low);
+        if (stbi__jpeg_get_bit(j))
+            data[0] += (short)(1 << j->succ_low);
     }
     return 1;
 }
@@ -1954,14 +1948,14 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg* j, short data[64], stbi__
 static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__huffman* hac, stbi__int16* fac)
 {
     int k;
-    if(j->spec_start == 0)
+    if (j->spec_start == 0)
         return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 
-    if(j->succ_high == 0)
+    if (j->succ_high == 0)
     {
         int shift = j->succ_low;
 
-        if(j->eob_run)
+        if (j->eob_run)
         {
             --j->eob_run;
             return 1;
@@ -1972,32 +1966,32 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
         {
             unsigned int zig;
             int c, r, s;
-            if(j->code_bits < 16)
+            if (j->code_bits < 16)
                 stbi__grow_buffer_unsafe(j);
             c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
             r = fac[c];
-            if(r)
-            {    // fast-AC path
-                k += (r >> 4) & 15;    // run
-                s = r & 15;    // combined length
+            if (r)
+            {                       // fast-AC path
+                k += (r >> 4) & 15; // run
+                s = r & 15;         // combined length
                 j->code_buffer <<= s;
                 j->code_bits -= s;
                 zig = stbi__jpeg_dezigzag[k++];
-                data[zig] = ( short )((r >> 8) << shift);
+                data[zig] = (short)((r >> 8) << shift);
             }
             else
             {
                 int rs = stbi__jpeg_huff_decode(j, hac);
-                if(rs < 0)
+                if (rs < 0)
                     return stbi__err("bad huffman code", "Corrupt JPEG");
                 s = rs & 15;
                 r = rs >> 4;
-                if(s == 0)
+                if (s == 0)
                 {
-                    if(r < 15)
+                    if (r < 15)
                     {
                         j->eob_run = (1 << r);
-                        if(r)
+                        if (r)
                             j->eob_run += stbi__jpeg_get_bits(j, r);
                         --j->eob_run;
                         break;
@@ -2008,28 +2002,28 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
                 {
                     k += r;
                     zig = stbi__jpeg_dezigzag[k++];
-                    data[zig] = ( short )(stbi__extend_receive(j, s) << shift);
+                    data[zig] = (short)(stbi__extend_receive(j, s) << shift);
                 }
             }
-        } while(k <= j->spec_end);
+        } while (k <= j->spec_end);
     }
     else
     {
         // refinement scan for these AC coefficients
 
-        short bit = ( short )(1 << j->succ_low);
+        short bit = (short)(1 << j->succ_low);
 
-        if(j->eob_run)
+        if (j->eob_run)
         {
             --j->eob_run;
-            for(k = j->spec_start; k <= j->spec_end; ++k)
+            for (k = j->spec_start; k <= j->spec_end; ++k)
             {
                 short* p = &data[stbi__jpeg_dezigzag[k]];
-                if(*p != 0)
-                    if(stbi__jpeg_get_bit(j))
-                        if((*p & bit) == 0)
+                if (*p != 0)
+                    if (stbi__jpeg_get_bit(j))
+                        if ((*p & bit) == 0)
                         {
-                            if(*p > 0)
+                            if (*p > 0)
                                 *p += bit;
                             else
                                 *p -= bit;
@@ -2043,19 +2037,19 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
             {
                 int r, s;
                 int rs = stbi__jpeg_huff_decode(
-                    j, hac);    // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
-                if(rs < 0)
+                    j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+                if (rs < 0)
                     return stbi__err("bad huffman code", "Corrupt JPEG");
                 s = rs & 15;
                 r = rs >> 4;
-                if(s == 0)
+                if (s == 0)
                 {
-                    if(r < 15)
+                    if (r < 15)
                     {
                         j->eob_run = (1 << r) - 1;
-                        if(r)
+                        if (r)
                             j->eob_run += stbi__jpeg_get_bits(j, r);
-                        r = 64;    // force end of block
+                        r = 64; // force end of block
                     }
                     else
                     {
@@ -2066,25 +2060,25 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
                 }
                 else
                 {
-                    if(s != 1)
+                    if (s != 1)
                         return stbi__err("bad huffman code", "Corrupt JPEG");
                     // sign bit
-                    if(stbi__jpeg_get_bit(j))
+                    if (stbi__jpeg_get_bit(j))
                         s = bit;
                     else
                         s = -bit;
                 }
 
                 // advance by r
-                while(k <= j->spec_end)
+                while (k <= j->spec_end)
                 {
                     short* p = &data[stbi__jpeg_dezigzag[k++]];
-                    if(*p != 0)
+                    if (*p != 0)
                     {
-                        if(stbi__jpeg_get_bit(j))
-                            if((*p & bit) == 0)
+                        if (stbi__jpeg_get_bit(j))
+                            if ((*p & bit) == 0)
                             {
-                                if(*p > 0)
+                                if (*p > 0)
                                     *p += bit;
                                 else
                                     *p -= bit;
@@ -2092,15 +2086,15 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
                     }
                     else
                     {
-                        if(r == 0)
+                        if (r == 0)
                         {
-                            *p = ( short )s;
+                            *p = (short)s;
                             break;
                         }
                         --r;
                     }
                 }
-            } while(k <= j->spec_end);
+            } while (k <= j->spec_end);
         }
     }
     return 1;
@@ -2110,18 +2104,18 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg* j, short data[64], stbi__
 stbi_inline static stbi_uc stbi__clamp(int x)
 {
     // trick to use a single test to catch both cases
-    if(( unsigned int )x > 255)
+    if ((unsigned int)x > 255)
     {
-        if(x < 0)
+        if (x < 0)
             return 0;
-        if(x > 255)
+        if (x > 255)
             return 255;
     }
-    return ( stbi_uc )x;
+    return (stbi_uc)x;
 }
 
-#define stbi__f2f(x) (( int )((( x )*4096 + 0.5)))
-#define stbi__fsh(x) (( x )*4096)
+#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
+#define stbi__fsh(x) ((x)*4096)
 
 // derived from jidctint -- DCT_ISLOW
 #define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7)       \
@@ -2168,10 +2162,10 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64])
     short* d = data;
 
     // columns
-    for(i = 0; i < 8; ++i, ++d, ++v)
+    for (i = 0; i < 8; ++i, ++d, ++v)
     {
         // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-        if(d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0)
+        if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0)
         {
             //    no shortcut                 0     seconds
             //    (1|2|3|4|5|6|7)==0          0     seconds
@@ -2200,7 +2194,7 @@ static void stbi__idct_block(stbi_uc* out, int out_stride, short data[64])
         }
     }
 
-    for(i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride)
+    for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride)
     {
         // no fast case since the first 1D IDCT spread components out
         STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
@@ -2330,14 +2324,14 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
     __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
 
     // load
-    row0 = _mm_load_si128(( const __m128i* )(data + 0 * 8));
-    row1 = _mm_load_si128(( const __m128i* )(data + 1 * 8));
-    row2 = _mm_load_si128(( const __m128i* )(data + 2 * 8));
-    row3 = _mm_load_si128(( const __m128i* )(data + 3 * 8));
-    row4 = _mm_load_si128(( const __m128i* )(data + 4 * 8));
-    row5 = _mm_load_si128(( const __m128i* )(data + 5 * 8));
-    row6 = _mm_load_si128(( const __m128i* )(data + 6 * 8));
-    row7 = _mm_load_si128(( const __m128i* )(data + 7 * 8));
+    row0 = _mm_load_si128((const __m128i*)(data + 0 * 8));
+    row1 = _mm_load_si128((const __m128i*)(data + 1 * 8));
+    row2 = _mm_load_si128((const __m128i*)(data + 2 * 8));
+    row3 = _mm_load_si128((const __m128i*)(data + 3 * 8));
+    row4 = _mm_load_si128((const __m128i*)(data + 4 * 8));
+    row5 = _mm_load_si128((const __m128i*)(data + 5 * 8));
+    row6 = _mm_load_si128((const __m128i*)(data + 6 * 8));
+    row7 = _mm_load_si128((const __m128i*)(data + 7 * 8));
 
     // column pass
     dct_pass(bias_0, 10);
@@ -2367,39 +2361,39 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 
     {
         // pack
-        __m128i p0 = _mm_packus_epi16(row0, row1);    // a0a1a2a3...a7b0b1b2b3...b7
+        __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
         __m128i p1 = _mm_packus_epi16(row2, row3);
         __m128i p2 = _mm_packus_epi16(row4, row5);
         __m128i p3 = _mm_packus_epi16(row6, row7);
 
         // 8bit 8x8 transpose pass 1
-        dct_interleave8(p0, p2);    // a0e0a1e1...
-        dct_interleave8(p1, p3);    // c0g0c1g1...
+        dct_interleave8(p0, p2); // a0e0a1e1...
+        dct_interleave8(p1, p3); // c0g0c1g1...
 
         // transpose pass 2
-        dct_interleave8(p0, p1);    // a0c0e0g0...
-        dct_interleave8(p2, p3);    // b0d0f0h0...
+        dct_interleave8(p0, p1); // a0c0e0g0...
+        dct_interleave8(p2, p3); // b0d0f0h0...
 
         // transpose pass 3
-        dct_interleave8(p0, p2);    // a0b0c0d0...
-        dct_interleave8(p1, p3);    // a4b4c4d4...
+        dct_interleave8(p0, p2); // a0b0c0d0...
+        dct_interleave8(p1, p3); // a4b4c4d4...
 
         // store
-        _mm_storel_epi64(( __m128i* )out, p0);
+        _mm_storel_epi64((__m128i*)out, p0);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p0, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p0, 0x4e));
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, p2);
+        _mm_storel_epi64((__m128i*)out, p2);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p2, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p2, 0x4e));
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, p1);
+        _mm_storel_epi64((__m128i*)out, p1);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p1, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p1, 0x4e));
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, p3);
+        _mm_storel_epi64((__m128i*)out, p3);
         out += out_stride;
-        _mm_storel_epi64(( __m128i* )out, _mm_shuffle_epi32(p3, 0x4e));
+        _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi32(p3, 0x4e));
     }
 
 #undef dct_const
@@ -2413,7 +2407,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 #undef dct_pass
 }
 
-#endif    // STBI_SSE2
+#endif // STBI_SSE2
 
 #ifdef STBI_NEON
 
@@ -2548,19 +2542,19 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
     }
 
         // pass 1
-        dct_trn16(row0, row1);    // a0b0a2b2a4b4a6b6
+        dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
         dct_trn16(row2, row3);
         dct_trn16(row4, row5);
         dct_trn16(row6, row7);
 
         // pass 2
-        dct_trn32(row0, row2);    // a0b0c0d0a4b4c4d4
+        dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
         dct_trn32(row1, row3);
         dct_trn32(row4, row6);
         dct_trn32(row5, row7);
 
         // pass 3
-        dct_trn64(row0, row4);    // a0b0c0d0e0f0g0h0
+        dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
         dct_trn64(row1, row5);
         dct_trn64(row2, row6);
         dct_trn64(row3, row7);
@@ -2659,7 +2653,7 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 #undef dct_pass
 }
 
-#endif    // STBI_NEON
+#endif // STBI_NEON
 
 #define STBI__MARKER_none 0xff
 // if there's a pending marker from the entropy stream, return that
@@ -2668,17 +2662,17 @@ static void stbi__idct_simd(stbi_uc* out, int out_stride, short data[64])
 static stbi_uc stbi__get_marker(stbi__jpeg* j)
 {
     stbi_uc x;
-    if(j->marker != STBI__MARKER_none)
+    if (j->marker != STBI__MARKER_none)
     {
         x = j->marker;
         j->marker = STBI__MARKER_none;
         return x;
     }
     x = stbi__get8(j->s);
-    if(x != 0xff)
+    if (x != 0xff)
         return STBI__MARKER_none;
-    while(x == 0xff)
-        x = stbi__get8(j->s);    // consume repeated 0xff fill bytes
+    while (x == 0xff)
+        x = stbi__get8(j->s); // consume repeated 0xff fill bytes
     return x;
 }
 
@@ -2704,9 +2698,9 @@ static void stbi__jpeg_reset(stbi__jpeg* j)
 static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
 {
     stbi__jpeg_reset(z);
-    if(!z->progressive)
+    if (!z->progressive)
     {
-        if(z->scan_n == 1)
+        if (z->scan_n == 1)
         {
             int i, j;
             STBI_SIMD_ALIGN(short, data[64]);
@@ -2717,24 +2711,24 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             // component has, independent of interleaved MCU blocking and such
             int w = (z->img_comp[n].x + 7) >> 3;
             int h = (z->img_comp[n].y + 7) >> 3;
-            for(j = 0; j < h; ++j)
+            for (j = 0; j < h; ++j)
             {
-                for(i = 0; i < w; ++i)
+                for (i = 0; i < w; ++i)
                 {
                     int ha = z->img_comp[n].ha;
-                    if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
-                                                z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
+                    if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
+                                                 z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
                         return 0;
                     z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2,
                                          data);
                     // every data block is an MCU, so countdown the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
                         // if it's NOT a restart, then just bail, so we get corrupt data
                         // rather than no data
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2743,28 +2737,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             return 1;
         }
         else
-        {    // interleaved
+        { // interleaved
             int i, j, k, x, y;
             STBI_SIMD_ALIGN(short, data[64]);
-            for(j = 0; j < z->img_mcu_y; ++j)
+            for (j = 0; j < z->img_mcu_y; ++j)
             {
-                for(i = 0; i < z->img_mcu_x; ++i)
+                for (i = 0; i < z->img_mcu_x; ++i)
                 {
                     // scan an interleaved mcu... process scan_n components in order
-                    for(k = 0; k < z->scan_n; ++k)
+                    for (k = 0; k < z->scan_n; ++k)
                     {
                         int n = z->order[k];
                         // scan out an mcu's worth of this component; that's just determined
                         // by the basic H and V specified for the component
-                        for(y = 0; y < z->img_comp[n].v; ++y)
+                        for (y = 0; y < z->img_comp[n].v; ++y)
                         {
-                            for(x = 0; x < z->img_comp[n].h; ++x)
+                            for (x = 0; x < z->img_comp[n].h; ++x)
                             {
                                 int x2 = (i * z->img_comp[n].h + x) * 8;
                                 int y2 = (j * z->img_comp[n].v + y) * 8;
                                 int ha = z->img_comp[n].ha;
-                                if(!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
-                                                            z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
+                                if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
+                                                             z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
                                     return 0;
                                 z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2,
                                                      z->img_comp[n].w2, data);
@@ -2773,11 +2767,11 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
                     }
                     // after all interleaved components, that's an interleaved MCU,
                     // so now count down the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2788,7 +2782,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
     }
     else
     {
-        if(z->scan_n == 1)
+        if (z->scan_n == 1)
         {
             int i, j;
             int n = z->order[0];
@@ -2798,28 +2792,28 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             // component has, independent of interleaved MCU blocking and such
             int w = (z->img_comp[n].x + 7) >> 3;
             int h = (z->img_comp[n].y + 7) >> 3;
-            for(j = 0; j < h; ++j)
+            for (j = 0; j < h; ++j)
             {
-                for(i = 0; i < w; ++i)
+                for (i = 0; i < w; ++i)
                 {
                     short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-                    if(z->spec_start == 0)
+                    if (z->spec_start == 0)
                     {
-                        if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                             return 0;
                     }
                     else
                     {
                         int ha = z->img_comp[n].ha;
-                        if(!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                        if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
                             return 0;
                     }
                     // every data block is an MCU, so countdown the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2828,37 +2822,37 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
             return 1;
         }
         else
-        {    // interleaved
+        { // interleaved
             int i, j, k, x, y;
-            for(j = 0; j < z->img_mcu_y; ++j)
+            for (j = 0; j < z->img_mcu_y; ++j)
             {
-                for(i = 0; i < z->img_mcu_x; ++i)
+                for (i = 0; i < z->img_mcu_x; ++i)
                 {
                     // scan an interleaved mcu... process scan_n components in order
-                    for(k = 0; k < z->scan_n; ++k)
+                    for (k = 0; k < z->scan_n; ++k)
                     {
                         int n = z->order[k];
                         // scan out an mcu's worth of this component; that's just determined
                         // by the basic H and V specified for the component
-                        for(y = 0; y < z->img_comp[n].v; ++y)
+                        for (y = 0; y < z->img_comp[n].v; ++y)
                         {
-                            for(x = 0; x < z->img_comp[n].h; ++x)
+                            for (x = 0; x < z->img_comp[n].h; ++x)
                             {
                                 int x2 = (i * z->img_comp[n].h + x);
                                 int y2 = (j * z->img_comp[n].v + y);
                                 short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
-                                if(!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                                if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                                     return 0;
                             }
                         }
                     }
                     // after all interleaved components, that's an interleaved MCU,
                     // so now count down the restart interval
-                    if(--z->todo <= 0)
+                    if (--z->todo <= 0)
                     {
-                        if(z->code_bits < 24)
+                        if (z->code_bits < 24)
                             stbi__grow_buffer_unsafe(z);
-                        if(!STBI__RESTART(z->marker))
+                        if (!STBI__RESTART(z->marker))
                             return 1;
                         stbi__jpeg_reset(z);
                     }
@@ -2872,23 +2866,23 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg* z)
 static void stbi__jpeg_dequantize(short* data, stbi__uint16* dequant)
 {
     int i;
-    for(i = 0; i < 64; ++i)
+    for (i = 0; i < 64; ++i)
         data[i] *= dequant[i];
 }
 
 static void stbi__jpeg_finish(stbi__jpeg* z)
 {
-    if(z->progressive)
+    if (z->progressive)
     {
         // dequantize and idct the data
         int i, j, n;
-        for(n = 0; n < z->s->img_n; ++n)
+        for (n = 0; n < z->s->img_n; ++n)
         {
             int w = (z->img_comp[n].x + 7) >> 3;
             int h = (z->img_comp[n].y + 7) >> 3;
-            for(j = 0; j < h; ++j)
+            for (j = 0; j < h; ++j)
             {
-                for(i = 0; i < w; ++i)
+                for (i = 0; i < w; ++i)
                 {
                     short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
                     stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
@@ -2903,114 +2897,113 @@ static void stbi__jpeg_finish(stbi__jpeg* z)
 static int stbi__process_marker(stbi__jpeg* z, int m)
 {
     int L;
-    switch(m)
+    switch (m)
     {
-        case STBI__MARKER_none:    // no marker found
-            return stbi__err("expected marker", "Corrupt JPEG");
+    case STBI__MARKER_none: // no marker found
+        return stbi__err("expected marker", "Corrupt JPEG");
 
-        case 0xDD:    // DRI - specify restart interval
-            if(stbi__get16be(z->s) != 4)
-                return stbi__err("bad DRI len", "Corrupt JPEG");
-            z->restart_interval = stbi__get16be(z->s);
-            return 1;
+    case 0xDD: // DRI - specify restart interval
+        if (stbi__get16be(z->s) != 4)
+            return stbi__err("bad DRI len", "Corrupt JPEG");
+        z->restart_interval = stbi__get16be(z->s);
+        return 1;
+
+    case 0xDB: // DQT - define quantization table
+        L = stbi__get16be(z->s) - 2;
+        while (L > 0)
+        {
+            int q = stbi__get8(z->s);
+            int p = q >> 4, sixteen = (p != 0);
+            int t = q & 15, i;
+            if (p != 0 && p != 1)
+                return stbi__err("bad DQT type", "Corrupt JPEG");
+            if (t > 3)
+                return stbi__err("bad DQT table", "Corrupt JPEG");
+
+            for (i = 0; i < 64; ++i)
+                z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            L -= (sixteen ? 129 : 65);
+        }
+        return L == 0;
 
-        case 0xDB:    // DQT - define quantization table
-            L = stbi__get16be(z->s) - 2;
-            while(L > 0)
+    case 0xC4: // DHT - define huffman table
+        L = stbi__get16be(z->s) - 2;
+        while (L > 0)
+        {
+            stbi_uc* v;
+            int sizes[16], i, n = 0;
+            int q = stbi__get8(z->s);
+            int tc = q >> 4;
+            int th = q & 15;
+            if (tc > 1 || th > 3)
+                return stbi__err("bad DHT header", "Corrupt JPEG");
+            for (i = 0; i < 16; ++i)
             {
-                int q = stbi__get8(z->s);
-                int p = q >> 4, sixteen = (p != 0);
-                int t = q & 15, i;
-                if(p != 0 && p != 1)
-                    return stbi__err("bad DQT type", "Corrupt JPEG");
-                if(t > 3)
-                    return stbi__err("bad DQT table", "Corrupt JPEG");
-
-                for(i = 0; i < 64; ++i)
-                    z->dequant[t][stbi__jpeg_dezigzag[i]] =
-                        (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
-                L -= (sixteen ? 129 : 65);
+                sizes[i] = stbi__get8(z->s);
+                n += sizes[i];
             }
-            return L == 0;
-
-        case 0xC4:    // DHT - define huffman table
-            L = stbi__get16be(z->s) - 2;
-            while(L > 0)
+            L -= 17;
+            if (tc == 0)
             {
-                stbi_uc* v;
-                int sizes[16], i, n = 0;
-                int q = stbi__get8(z->s);
-                int tc = q >> 4;
-                int th = q & 15;
-                if(tc > 1 || th > 3)
-                    return stbi__err("bad DHT header", "Corrupt JPEG");
-                for(i = 0; i < 16; ++i)
-                {
-                    sizes[i] = stbi__get8(z->s);
-                    n += sizes[i];
-                }
-                L -= 17;
-                if(tc == 0)
-                {
-                    if(!stbi__build_huffman(z->huff_dc + th, sizes))
-                        return 0;
-                    v = z->huff_dc[th].values;
-                }
-                else
-                {
-                    if(!stbi__build_huffman(z->huff_ac + th, sizes))
-                        return 0;
-                    v = z->huff_ac[th].values;
-                }
-                for(i = 0; i < n; ++i)
-                    v[i] = stbi__get8(z->s);
-                if(tc != 0)
-                    stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
-                L -= n;
+                if (!stbi__build_huffman(z->huff_dc + th, sizes))
+                    return 0;
+                v = z->huff_dc[th].values;
             }
-            return L == 0;
+            else
+            {
+                if (!stbi__build_huffman(z->huff_ac + th, sizes))
+                    return 0;
+                v = z->huff_ac[th].values;
+            }
+            for (i = 0; i < n; ++i)
+                v[i] = stbi__get8(z->s);
+            if (tc != 0)
+                stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+            L -= n;
+        }
+        return L == 0;
     }
 
     // check for comment block or APP blocks
-    if((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
+    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
     {
         L = stbi__get16be(z->s);
-        if(L < 2)
+        if (L < 2)
         {
-            if(m == 0xFE)
+            if (m == 0xFE)
                 return stbi__err("bad COM len", "Corrupt JPEG");
             else
                 return stbi__err("bad APP len", "Corrupt JPEG");
         }
         L -= 2;
 
-        if(m == 0xE0 && L >= 5)
-        {    // JFIF APP0 segment
+        if (m == 0xE0 && L >= 5)
+        { // JFIF APP0 segment
             static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'};
             int ok = 1;
             int i;
-            for(i = 0; i < 5; ++i)
-                if(stbi__get8(z->s) != tag[i])
+            for (i = 0; i < 5; ++i)
+                if (stbi__get8(z->s) != tag[i])
                     ok = 0;
             L -= 5;
-            if(ok)
+            if (ok)
                 z->jfif = 1;
         }
-        else if(m == 0xEE && L >= 12)
-        {    // Adobe APP14 segment
+        else if (m == 0xEE && L >= 12)
+        { // Adobe APP14 segment
             static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'};
             int ok = 1;
             int i;
-            for(i = 0; i < 6; ++i)
-                if(stbi__get8(z->s) != tag[i])
+            for (i = 0; i < 6; ++i)
+                if (stbi__get8(z->s) != tag[i])
                     ok = 0;
             L -= 6;
-            if(ok)
+            if (ok)
             {
-                stbi__get8(z->s);    // version
-                stbi__get16be(z->s);    // flags0
-                stbi__get16be(z->s);    // flags1
-                z->app14_color_transform = stbi__get8(z->s);    // color transform
+                stbi__get8(z->s);                            // version
+                stbi__get16be(z->s);                         // flags0
+                stbi__get16be(z->s);                         // flags1
+                z->app14_color_transform = stbi__get8(z->s); // color transform
                 L -= 6;
             }
         }
@@ -3028,24 +3021,24 @@ static int stbi__process_scan_header(stbi__jpeg* z)
     int i;
     int Ls = stbi__get16be(z->s);
     z->scan_n = stbi__get8(z->s);
-    if(z->scan_n < 1 || z->scan_n > 4 || z->scan_n > ( int )z->s->img_n)
+    if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n)
         return stbi__err("bad SOS component count", "Corrupt JPEG");
-    if(Ls != 6 + 2 * z->scan_n)
+    if (Ls != 6 + 2 * z->scan_n)
         return stbi__err("bad SOS len", "Corrupt JPEG");
-    for(i = 0; i < z->scan_n; ++i)
+    for (i = 0; i < z->scan_n; ++i)
     {
         int id = stbi__get8(z->s), which;
         int q = stbi__get8(z->s);
-        for(which = 0; which < z->s->img_n; ++which)
-            if(z->img_comp[which].id == id)
+        for (which = 0; which < z->s->img_n; ++which)
+            if (z->img_comp[which].id == id)
                 break;
-        if(which == z->s->img_n)
-            return 0;    // no match
+        if (which == z->s->img_n)
+            return 0; // no match
         z->img_comp[which].hd = q >> 4;
-        if(z->img_comp[which].hd > 3)
+        if (z->img_comp[which].hd > 3)
             return stbi__err("bad DC huff", "Corrupt JPEG");
         z->img_comp[which].ha = q & 15;
-        if(z->img_comp[which].ha > 3)
+        if (z->img_comp[which].ha > 3)
             return stbi__err("bad AC huff", "Corrupt JPEG");
         z->order[i] = which;
     }
@@ -3053,21 +3046,20 @@ static int stbi__process_scan_header(stbi__jpeg* z)
     {
         int aa;
         z->spec_start = stbi__get8(z->s);
-        z->spec_end = stbi__get8(z->s);    // should be 63, but might be 0
+        z->spec_end = stbi__get8(z->s); // should be 63, but might be 0
         aa = stbi__get8(z->s);
         z->succ_high = (aa >> 4);
         z->succ_low = (aa & 15);
-        if(z->progressive)
+        if (z->progressive)
         {
-            if(z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 ||
-               z->succ_low > 13)
+            if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
                 return stbi__err("bad SOS", "Corrupt JPEG");
         }
         else
         {
-            if(z->spec_start != 0)
+            if (z->spec_start != 0)
                 return stbi__err("bad SOS", "Corrupt JPEG");
-            if(z->succ_high != 0 || z->succ_low != 0)
+            if (z->succ_high != 0 || z->succ_low != 0)
                 return stbi__err("bad SOS", "Corrupt JPEG");
             z->spec_end = 63;
         }
@@ -3079,21 +3071,21 @@ static int stbi__process_scan_header(stbi__jpeg* z)
 static int stbi__free_jpeg_components(stbi__jpeg* z, int ncomp, int why)
 {
     int i;
-    for(i = 0; i < ncomp; ++i)
+    for (i = 0; i < ncomp; ++i)
     {
-        if(z->img_comp[i].raw_data)
+        if (z->img_comp[i].raw_data)
         {
             STBI_FREE(z->img_comp[i].raw_data);
             z->img_comp[i].raw_data = NULL;
             z->img_comp[i].data = NULL;
         }
-        if(z->img_comp[i].raw_coeff)
+        if (z->img_comp[i].raw_coeff)
         {
             STBI_FREE(z->img_comp[i].raw_coeff);
             z->img_comp[i].raw_coeff = 0;
             z->img_comp[i].coeff = 0;
         }
-        if(z->img_comp[i].linebuf)
+        if (z->img_comp[i].linebuf)
         {
             STBI_FREE(z->img_comp[i].linebuf);
             z->img_comp[i].linebuf = NULL;
@@ -3107,62 +3099,62 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan)
     stbi__context* s = z->s;
     int Lf, p, i, q, h_max = 1, v_max = 1, c;
     Lf = stbi__get16be(s);
-    if(Lf < 11)
-        return stbi__err("bad SOF len", "Corrupt JPEG");    // JPEG
+    if (Lf < 11)
+        return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG
     p = stbi__get8(s);
-    if(p != 8)
-        return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only");    // JPEG baseline
+    if (p != 8)
+        return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline
     s->img_y = stbi__get16be(s);
-    if(s->img_y == 0)
+    if (s->img_y == 0)
         return stbi__err(
             "no header height",
-            "JPEG format not supported: delayed height");    // Legal, but we don't handle it--but neither does IJG
+            "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
     s->img_x = stbi__get16be(s);
-    if(s->img_x == 0)
-        return stbi__err("0 width", "Corrupt JPEG");    // JPEG requires
+    if (s->img_x == 0)
+        return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires
     c = stbi__get8(s);
-    if(c != 3 && c != 1 && c != 4)
+    if (c != 3 && c != 1 && c != 4)
         return stbi__err("bad component count", "Corrupt JPEG");
     s->img_n = c;
-    for(i = 0; i < c; ++i)
+    for (i = 0; i < c; ++i)
     {
         z->img_comp[i].data = NULL;
         z->img_comp[i].linebuf = NULL;
     }
 
-    if(Lf != 8 + 3 * s->img_n)
+    if (Lf != 8 + 3 * s->img_n)
         return stbi__err("bad SOF len", "Corrupt JPEG");
 
     z->rgb = 0;
-    for(i = 0; i < s->img_n; ++i)
+    for (i = 0; i < s->img_n; ++i)
     {
         static const unsigned char rgb[3] = {'R', 'G', 'B'};
         z->img_comp[i].id = stbi__get8(s);
-        if(s->img_n == 3 && z->img_comp[i].id == rgb[i])
+        if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
             ++z->rgb;
         q = stbi__get8(s);
         z->img_comp[i].h = (q >> 4);
-        if(!z->img_comp[i].h || z->img_comp[i].h > 4)
+        if (!z->img_comp[i].h || z->img_comp[i].h > 4)
             return stbi__err("bad H", "Corrupt JPEG");
         z->img_comp[i].v = q & 15;
-        if(!z->img_comp[i].v || z->img_comp[i].v > 4)
+        if (!z->img_comp[i].v || z->img_comp[i].v > 4)
             return stbi__err("bad V", "Corrupt JPEG");
         z->img_comp[i].tq = stbi__get8(s);
-        if(z->img_comp[i].tq > 3)
+        if (z->img_comp[i].tq > 3)
             return stbi__err("bad TQ", "Corrupt JPEG");
     }
 
-    if(scan != STBI__SCAN_load)
+    if (scan != STBI__SCAN_load)
         return 1;
 
-    if(!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0))
+    if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0))
         return stbi__err("too large", "Image too large to decode");
 
-    for(i = 0; i < s->img_n; ++i)
+    for (i = 0; i < s->img_n; ++i)
     {
-        if(z->img_comp[i].h > h_max)
+        if (z->img_comp[i].h > h_max)
             h_max = z->img_comp[i].h;
-        if(z->img_comp[i].v > v_max)
+        if (z->img_comp[i].v > v_max)
             v_max = z->img_comp[i].v;
     }
 
@@ -3175,7 +3167,7 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan)
     z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
     z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
 
-    for(i = 0; i < s->img_n; ++i)
+    for (i = 0; i < s->img_n; ++i)
     {
         // number of effective pixels (e.g. for non-interleaved MCU)
         z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
@@ -3193,19 +3185,19 @@ static int stbi__process_frame_header(stbi__jpeg* z, int scan)
         z->img_comp[i].raw_coeff = 0;
         z->img_comp[i].linebuf = NULL;
         z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
-        if(z->img_comp[i].raw_data == NULL)
+        if (z->img_comp[i].raw_data == NULL)
             return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory"));
         // align blocks for idct using mmx/sse
-        z->img_comp[i].data = ( stbi_uc* )((( size_t )z->img_comp[i].raw_data + 15) & ~15);
-        if(z->progressive)
+        z->img_comp[i].data = (stbi_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
+        if (z->progressive)
         {
             // w2, h2 are multiples of 8 (see above)
             z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
             z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
             z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
-            if(z->img_comp[i].raw_coeff == NULL)
+            if (z->img_comp[i].raw_coeff == NULL)
                 return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory"));
-            z->img_comp[i].coeff = ( short* )((( size_t )z->img_comp[i].raw_coeff + 15) & ~15);
+            z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
         }
     }
 
@@ -3225,29 +3217,29 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan)
 {
     int m;
     z->jfif = 0;
-    z->app14_color_transform = -1;    // valid values are 0,1,2
-    z->marker = STBI__MARKER_none;    // initialize cached marker to empty
+    z->app14_color_transform = -1; // valid values are 0,1,2
+    z->marker = STBI__MARKER_none; // initialize cached marker to empty
     m = stbi__get_marker(z);
-    if(!stbi__SOI(m))
+    if (!stbi__SOI(m))
         return stbi__err("no SOI", "Corrupt JPEG");
-    if(scan == STBI__SCAN_type)
+    if (scan == STBI__SCAN_type)
         return 1;
     m = stbi__get_marker(z);
-    while(!stbi__SOF(m))
+    while (!stbi__SOF(m))
     {
-        if(!stbi__process_marker(z, m))
+        if (!stbi__process_marker(z, m))
             return 0;
         m = stbi__get_marker(z);
-        while(m == STBI__MARKER_none)
+        while (m == STBI__MARKER_none)
         {
             // some files have extra padding after their blocks, so ok, we'll scan
-            if(stbi__at_eof(z->s))
+            if (stbi__at_eof(z->s))
                 return stbi__err("no SOF", "Corrupt JPEG");
             m = stbi__get_marker(z);
         }
     }
     z->progressive = stbi__SOF_progressive(m);
-    if(!stbi__process_frame_header(z, scan))
+    if (!stbi__process_frame_header(z, scan))
         return 0;
     return 1;
 }
@@ -3256,30 +3248,30 @@ static int stbi__decode_jpeg_header(stbi__jpeg* z, int scan)
 static int stbi__decode_jpeg_image(stbi__jpeg* j)
 {
     int m;
-    for(m = 0; m < 4; m++)
+    for (m = 0; m < 4; m++)
     {
         j->img_comp[m].raw_data = NULL;
         j->img_comp[m].raw_coeff = NULL;
     }
     j->restart_interval = 0;
-    if(!stbi__decode_jpeg_header(j, STBI__SCAN_load))
+    if (!stbi__decode_jpeg_header(j, STBI__SCAN_load))
         return 0;
     m = stbi__get_marker(j);
-    while(!stbi__EOI(m))
+    while (!stbi__EOI(m))
     {
-        if(stbi__SOS(m))
+        if (stbi__SOS(m))
         {
-            if(!stbi__process_scan_header(j))
+            if (!stbi__process_scan_header(j))
                 return 0;
-            if(!stbi__parse_entropy_coded_data(j))
+            if (!stbi__parse_entropy_coded_data(j))
                 return 0;
-            if(j->marker == STBI__MARKER_none)
+            if (j->marker == STBI__MARKER_none)
             {
                 // handle 0s at the end of image data from IP Kamera 9060
-                while(!stbi__at_eof(j->s))
+                while (!stbi__at_eof(j->s))
                 {
                     int x = stbi__get8(j->s);
-                    if(x == 255)
+                    if (x == 255)
                     {
                         j->marker = stbi__get8(j->s);
                         break;
@@ -3289,23 +3281,23 @@ static int stbi__decode_jpeg_image(stbi__jpeg* j)
                 // return 0
             }
         }
-        else if(stbi__DNL(m))
+        else if (stbi__DNL(m))
         {
             int Ld = stbi__get16be(j->s);
             stbi__uint32 NL = stbi__get16be(j->s);
-            if(Ld != 4)
+            if (Ld != 4)
                 return stbi__err("bad DNL len", "Corrupt JPEG");
-            if(NL != j->s->img_y)
+            if (NL != j->s->img_y)
                 return stbi__err("bad DNL height", "Corrupt JPEG");
         }
         else
         {
-            if(!stbi__process_marker(j, m))
+            if (!stbi__process_marker(j, m))
                 return 0;
         }
         m = stbi__get_marker(j);
     }
-    if(j->progressive)
+    if (j->progressive)
         stbi__jpeg_finish(j);
     return 1;
 }
@@ -3330,7 +3322,7 @@ static stbi_uc* stbi__resample_row_v_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
     // need to generate two samples vertically for every one in input
     int i;
     STBI_NOTUSED(hs);
-    for(i = 0; i < w; ++i)
+    for (i = 0; i < w; ++i)
         out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2);
     return out;
 }
@@ -3341,7 +3333,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
     int i;
     stbi_uc* input = in_near;
 
-    if(w == 1)
+    if (w == 1)
     {
         // if only one sample, can't do any interpolation
         out[0] = out[1] = input[0];
@@ -3350,7 +3342,7 @@ static stbi_uc* stbi__resample_row_h_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
 
     out[0] = input[0];
     out[1] = stbi__div4(input[0] * 3 + input[1] + 2);
-    for(i = 1; i < w - 1; ++i)
+    for (i = 1; i < w - 1; ++i)
     {
         int n = 3 * input[i] + 2;
         out[i * 2 + 0] = stbi__div4(n + input[i - 1]);
@@ -3371,7 +3363,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
 {
     // need to generate 2x2 samples for every one in input
     int i, t0, t1;
-    if(w == 1)
+    if (w == 1)
     {
         out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
         return out;
@@ -3379,7 +3371,7 @@ static stbi_uc* stbi__resample_row_hv_2(stbi_uc* out, stbi_uc* in_near, stbi_uc*
 
     t1 = 3 * in_near[0] + in_far[0];
     out[0] = stbi__div4(t1 + 2);
-    for(i = 1; i < w; ++i)
+    for (i = 1; i < w; ++i)
     {
         t0 = t1;
         t1 = 3 * in_near[i] + in_far[i];
@@ -3399,7 +3391,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
     // need to generate 2x2 samples for every one in input
     int i = 0, t0, t1;
 
-    if(w == 1)
+    if (w == 1)
     {
         out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
         return out;
@@ -3409,19 +3401,19 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
     // process groups of 8 pixels for as long as we can.
     // note we can't handle the last pixel in a row in this loop
     // because we need to handle the filter boundary conditions.
-    for(; i < ((w - 1) & ~7); i += 8)
+    for (; i < ((w - 1) & ~7); i += 8)
     {
 #if defined(STBI_SSE2)
         // load and perform the vertical filtering pass
         // this uses 3*x + y = 4*x + (y - x)
         __m128i zero = _mm_setzero_si128();
-        __m128i farb = _mm_loadl_epi64(( __m128i* )(in_far + i));
-        __m128i nearb = _mm_loadl_epi64(( __m128i* )(in_near + i));
+        __m128i farb = _mm_loadl_epi64((__m128i*)(in_far + i));
+        __m128i nearb = _mm_loadl_epi64((__m128i*)(in_near + i));
         __m128i farw = _mm_unpacklo_epi8(farb, zero);
         __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
         __m128i diff = _mm_sub_epi16(farw, nearw);
         __m128i nears = _mm_slli_epi16(nearw, 2);
-        __m128i curr = _mm_add_epi16(nears, diff);    // current row
+        __m128i curr = _mm_add_epi16(nears, diff); // current row
 
         // horizontal filter works the same based on shifted vers of current
         // row. "prev" is current row shifted right by 1 pixel; we need to
@@ -3453,7 +3445,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
 
         // pack and write output
         __m128i outv = _mm_packus_epi16(de0, de1);
-        _mm_storeu_si128(( __m128i* )(out + i * 2), outv);
+        _mm_storeu_si128((__m128i*)(out + i * 2), outv);
 #elif defined(STBI_NEON)
         // load and perform the vertical filtering pass
         // this uses 3*x + y = 4*x + (y - x)
@@ -3461,7 +3453,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
         uint8x8_t nearb = vld1_u8(in_near + i);
         int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
         int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
-        int16x8_t curr = vaddq_s16(nears, diff);    // current row
+        int16x8_t curr = vaddq_s16(nears, diff); // current row
 
         // horizontal filter works the same based on shifted vers of current
         // row. "prev" is current row shifted right by 1 pixel; we need to
@@ -3498,7 +3490,7 @@ static stbi_uc* stbi__resample_row_hv_2_simd(stbi_uc* out, stbi_uc* in_near, stb
     t1 = 3 * in_near[i] + in_far[i];
     out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
 
-    for(++i; i < w; ++i)
+    for (++i; i < w; ++i)
     {
         t0 = t1;
         t1 = 3 * in_near[i] + in_far[i];
@@ -3518,22 +3510,22 @@ static stbi_uc* stbi__resample_row_generic(stbi_uc* out, stbi_uc* in_near, stbi_
     // resample with nearest-neighbor
     int i, j;
     STBI_NOTUSED(in_far);
-    for(i = 0; i < w; ++i)
-        for(j = 0; j < hs; ++j)
+    for (i = 0; i < w; ++i)
+        for (j = 0; j < hs; ++j)
             out[i * hs + j] = in_near[i];
     return out;
 }
 
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define stbi__float2fixed(x) ((( int )(( x )*4096.0f + 0.5f)) << 8)
+#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
 static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc* pcb, const stbi_uc* pcr, int count,
                                    int step)
 {
     int i;
-    for(i = 0; i < count; ++i)
+    for (i = 0; i < count; ++i)
     {
-        int y_fixed = (y[i] << 20) + (1 << 19);    // rounding
+        int y_fixed = (y[i] << 20) + (1 << 19); // rounding
         int r, g, b;
         int cr = pcr[i] - 128;
         int cb = pcb[i] - 128;
@@ -3543,30 +3535,30 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc* out, const stbi_uc* y, const stbi_uc
         r >>= 20;
         g >>= 20;
         b >>= 20;
-        if(( unsigned )r > 255)
+        if ((unsigned)r > 255)
         {
-            if(r < 0)
+            if (r < 0)
                 r = 0;
             else
                 r = 255;
         }
-        if(( unsigned )g > 255)
+        if ((unsigned)g > 255)
         {
-            if(g < 0)
+            if (g < 0)
                 g = 0;
             else
                 g = 255;
         }
-        if(( unsigned )b > 255)
+        if ((unsigned)b > 255)
         {
-            if(b < 0)
+            if (b < 0)
                 b = 0;
             else
                 b = 255;
         }
-        out[0] = ( stbi_uc )r;
-        out[1] = ( stbi_uc )g;
-        out[2] = ( stbi_uc )b;
+        out[0] = (stbi_uc)r;
+        out[1] = (stbi_uc)g;
+        out[2] = (stbi_uc)b;
         out[3] = 255;
         out += step;
     }
@@ -3582,25 +3574,25 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
     // step == 3 is pretty ugly on the final interleave, and i'm not convinced
     // it's useful in practice (you wouldn't use it for textures, for example).
     // so just accelerate step == 4 case.
-    if(step == 4)
+    if (step == 4)
     {
         // this is a fairly straightforward implementation and not super-optimized.
         __m128i signflip = _mm_set1_epi8(-0x80);
-        __m128i cr_const0 = _mm_set1_epi16(( short )(1.40200f * 4096.0f + 0.5f));
-        __m128i cr_const1 = _mm_set1_epi16(-( short )(0.71414f * 4096.0f + 0.5f));
-        __m128i cb_const0 = _mm_set1_epi16(-( short )(0.34414f * 4096.0f + 0.5f));
-        __m128i cb_const1 = _mm_set1_epi16(( short )(1.77200f * 4096.0f + 0.5f));
-        __m128i y_bias = _mm_set1_epi8(( char )( unsigned char )128);
-        __m128i xw = _mm_set1_epi16(255);    // alpha channel
-
-        for(; i + 7 < count; i += 8)
+        __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
+        __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
+        __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
+        __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
+        __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
+        __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+        for (; i + 7 < count; i += 8)
         {
             // load
-            __m128i y_bytes = _mm_loadl_epi64(( __m128i* )(y + i));
-            __m128i cr_bytes = _mm_loadl_epi64(( __m128i* )(pcr + i));
-            __m128i cb_bytes = _mm_loadl_epi64(( __m128i* )(pcb + i));
-            __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip);    // -128
-            __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip);    // -128
+            __m128i y_bytes = _mm_loadl_epi64((__m128i*)(y + i));
+            __m128i cr_bytes = _mm_loadl_epi64((__m128i*)(pcr + i));
+            __m128i cb_bytes = _mm_loadl_epi64((__m128i*)(pcb + i));
+            __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+            __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
 
             // unpack to short (and left-shift cr, cb by 8)
             __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
@@ -3634,8 +3626,8 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
             __m128i o1 = _mm_unpackhi_epi16(t0, t1);
 
             // store
-            _mm_storeu_si128(( __m128i* )(out + 0), o0);
-            _mm_storeu_si128(( __m128i* )(out + 16), o1);
+            _mm_storeu_si128((__m128i*)(out + 0), o0);
+            _mm_storeu_si128((__m128i*)(out + 16), o1);
             out += 32;
         }
     }
@@ -3643,16 +3635,16 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
 
 #ifdef STBI_NEON
     // in this version, step=3 support would be easy to add. but is there demand?
-    if(step == 4)
+    if (step == 4)
     {
         // this is a fairly straightforward implementation and not super-optimized.
         uint8x8_t signflip = vdup_n_u8(0x80);
-        int16x8_t cr_const0 = vdupq_n_s16(( short )(1.40200f * 4096.0f + 0.5f));
-        int16x8_t cr_const1 = vdupq_n_s16(-( short )(0.71414f * 4096.0f + 0.5f));
-        int16x8_t cb_const0 = vdupq_n_s16(-( short )(0.34414f * 4096.0f + 0.5f));
-        int16x8_t cb_const1 = vdupq_n_s16(( short )(1.77200f * 4096.0f + 0.5f));
+        int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
+        int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
+        int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
+        int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
 
-        for(; i + 7 < count; i += 8)
+        for (; i + 7 < count; i += 8)
         {
             // load
             uint8x8_t y_bytes = vld1_u8(y + i);
@@ -3689,9 +3681,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
     }
 #endif
 
-    for(; i < count; ++i)
+    for (; i < count; ++i)
     {
-        int y_fixed = (y[i] << 20) + (1 << 19);    // rounding
+        int y_fixed = (y[i] << 20) + (1 << 19); // rounding
         int r, g, b;
         int cr = pcr[i] - 128;
         int cb = pcb[i] - 128;
@@ -3701,30 +3693,30 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc* out, stbi_uc const* y, stbi_uc cons
         r >>= 20;
         g >>= 20;
         b >>= 20;
-        if(( unsigned )r > 255)
+        if ((unsigned)r > 255)
         {
-            if(r < 0)
+            if (r < 0)
                 r = 0;
             else
                 r = 255;
         }
-        if(( unsigned )g > 255)
+        if ((unsigned)g > 255)
         {
-            if(g < 0)
+            if (g < 0)
                 g = 0;
             else
                 g = 255;
         }
-        if(( unsigned )b > 255)
+        if ((unsigned)b > 255)
         {
-            if(b < 0)
+            if (b < 0)
                 b = 0;
             else
                 b = 255;
         }
-        out[0] = ( stbi_uc )r;
-        out[1] = ( stbi_uc )g;
-        out[2] = ( stbi_uc )b;
+        out[0] = (stbi_uc)r;
+        out[1] = (stbi_uc)g;
+        out[2] = (stbi_uc)b;
         out[3] = 255;
         out += step;
     }
@@ -3739,7 +3731,7 @@ static void stbi__setup_jpeg(stbi__jpeg* j)
     j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
 
 #ifdef STBI_SSE2
-    if(stbi__sse2_available())
+    if (stbi__sse2_available())
     {
         j->idct_block_kernel = stbi__idct_simd;
         j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
@@ -3764,9 +3756,9 @@ typedef struct
 {
     resample_row_func resample;
     stbi_uc *line0, *line1;
-    int hs, vs;    // expansion factor in each axis
-    int w_lores;    // horizontal pixels pre-expansion
-    int ystep;    // how far through vertical expansion we are
+    int hs, vs;  // expansion factor in each axis
+    int w_lores; // horizontal pixels pre-expansion
+    int ystep;   // how far through vertical expansion we are
     int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
@@ -3780,25 +3772,26 @@ static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
 static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp)
 {
     int n, decode_n, is_rgb;
-    z->s->img_n = 0;    // make stbi__cleanup_jpeg safe
+    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 
     // validate req_comp
-    if(req_comp < 0 || req_comp > 4)
+    if (req_comp < 0 || req_comp > 4)
         return stbi__errpuc("bad req_comp", "Internal error");
 
     // load a jpeg image from whichever source, but leave in YCbCr format
-    if(!stbi__decode_jpeg_image(z))
+    if (!stbi__decode_jpeg_image(z))
     {
         stbi__cleanup_jpeg(z);
         return NULL;
     }
 
     // determine actual number of components to generate
-    n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+    n = req_comp ? req_comp : z->s->img_n >= 3 ? 3
+                                               : 1;
 
     is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 
-    if(z->s->img_n == 3 && n < 3 && !is_rgb)
+    if (z->s->img_n == 3 && n < 3 && !is_rgb)
         decode_n = 1;
     else
         decode_n = z->s->img_n;
@@ -3812,14 +3805,14 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
 
         stbi__resample res_comp[4];
 
-        for(k = 0; k < decode_n; ++k)
+        for (k = 0; k < decode_n; ++k)
         {
             stbi__resample* r = &res_comp[k];
 
             // allocate line buffer big enough for upsampling off the edges
             // with upsample factor of 4
-            z->img_comp[k].linebuf = ( stbi_uc* )stbi__malloc(z->s->img_x + 3);
-            if(!z->img_comp[k].linebuf)
+            z->img_comp[k].linebuf = (stbi_uc*)stbi__malloc(z->s->img_x + 3);
+            if (!z->img_comp[k].linebuf)
             {
                 stbi__cleanup_jpeg(z);
                 return stbi__errpuc("outofmem", "Out of memory");
@@ -3832,52 +3825,52 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
             r->ypos = 0;
             r->line0 = r->line1 = z->img_comp[k].data;
 
-            if(r->hs == 1 && r->vs == 1)
+            if (r->hs == 1 && r->vs == 1)
                 r->resample = resample_row_1;
-            else if(r->hs == 1 && r->vs == 2)
+            else if (r->hs == 1 && r->vs == 2)
                 r->resample = stbi__resample_row_v_2;
-            else if(r->hs == 2 && r->vs == 1)
+            else if (r->hs == 2 && r->vs == 1)
                 r->resample = stbi__resample_row_h_2;
-            else if(r->hs == 2 && r->vs == 2)
+            else if (r->hs == 2 && r->vs == 2)
                 r->resample = z->resample_row_hv_2_kernel;
             else
                 r->resample = stbi__resample_row_generic;
         }
 
         // can't error after this so, this is safe
-        output = ( stbi_uc* )stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
-        if(!output)
+        output = (stbi_uc*)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
+        if (!output)
         {
             stbi__cleanup_jpeg(z);
             return stbi__errpuc("outofmem", "Out of memory");
         }
 
         // now go ahead and resample
-        for(j = 0; j < z->s->img_y; ++j)
+        for (j = 0; j < z->s->img_y; ++j)
         {
             stbi_uc* out = output + n * z->s->img_x * j;
-            for(k = 0; k < decode_n; ++k)
+            for (k = 0; k < decode_n; ++k)
             {
                 stbi__resample* r = &res_comp[k];
                 int y_bot = r->ystep >= (r->vs >> 1);
                 coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0,
                                          y_bot ? r->line0 : r->line1, r->w_lores, r->hs);
-                if(++r->ystep >= r->vs)
+                if (++r->ystep >= r->vs)
                 {
                     r->ystep = 0;
                     r->line0 = r->line1;
-                    if(++r->ypos < z->img_comp[k].y)
+                    if (++r->ypos < z->img_comp[k].y)
                         r->line1 += z->img_comp[k].w2;
                 }
             }
-            if(n >= 3)
+            if (n >= 3)
             {
                 stbi_uc* y = coutput[0];
-                if(z->s->img_n == 3)
+                if (z->s->img_n == 3)
                 {
-                    if(is_rgb)
+                    if (is_rgb)
                     {
-                        for(i = 0; i < z->s->img_x; ++i)
+                        for (i = 0; i < z->s->img_x; ++i)
                         {
                             out[0] = y[i];
                             out[1] = coutput[1][i];
@@ -3891,11 +3884,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                         z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
                     }
                 }
-                else if(z->s->img_n == 4)
+                else if (z->s->img_n == 4)
                 {
-                    if(z->app14_color_transform == 0)
-                    {    // CMYK
-                        for(i = 0; i < z->s->img_x; ++i)
+                    if (z->app14_color_transform == 0)
+                    { // CMYK
+                        for (i = 0; i < z->s->img_x; ++i)
                         {
                             stbi_uc m = coutput[3][i];
                             out[0] = stbi__blinn_8x8(coutput[0][i], m);
@@ -3905,10 +3898,10 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                             out += n;
                         }
                     }
-                    else if(z->app14_color_transform == 2)
-                    {    // YCCK
+                    else if (z->app14_color_transform == 2)
+                    { // YCCK
                         z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                        for(i = 0; i < z->s->img_x; ++i)
+                        for (i = 0; i < z->s->img_x; ++i)
                         {
                             stbi_uc m = coutput[3][i];
                             out[0] = stbi__blinn_8x8(255 - out[0], m);
@@ -3918,37 +3911,37 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                         }
                     }
                     else
-                    {    // YCbCr + alpha?  Ignore the fourth channel for now
+                    { // YCbCr + alpha?  Ignore the fourth channel for now
                         z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
                     }
                 }
                 else
-                    for(i = 0; i < z->s->img_x; ++i)
+                    for (i = 0; i < z->s->img_x; ++i)
                     {
                         out[0] = out[1] = out[2] = y[i];
-                        out[3] = 255;    // not used if n==3
+                        out[3] = 255; // not used if n==3
                         out += n;
                     }
             }
             else
             {
-                if(is_rgb)
+                if (is_rgb)
                 {
-                    if(n == 1)
-                        for(i = 0; i < z->s->img_x; ++i)
+                    if (n == 1)
+                        for (i = 0; i < z->s->img_x; ++i)
                             *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
                     else
                     {
-                        for(i = 0; i < z->s->img_x; ++i, out += 2)
+                        for (i = 0; i < z->s->img_x; ++i, out += 2)
                         {
                             out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
                             out[1] = 255;
                         }
                     }
                 }
-                else if(z->s->img_n == 4 && z->app14_color_transform == 0)
+                else if (z->s->img_n == 4 && z->app14_color_transform == 0)
                 {
-                    for(i = 0; i < z->s->img_x; ++i)
+                    for (i = 0; i < z->s->img_x; ++i)
                     {
                         stbi_uc m = coutput[3][i];
                         stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
@@ -3959,9 +3952,9 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                         out += n;
                     }
                 }
-                else if(z->s->img_n == 4 && z->app14_color_transform == 2)
+                else if (z->s->img_n == 4 && z->app14_color_transform == 2)
                 {
-                    for(i = 0; i < z->s->img_x; ++i)
+                    for (i = 0; i < z->s->img_x; ++i)
                     {
                         out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
                         out[1] = 255;
@@ -3971,11 +3964,11 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
                 else
                 {
                     stbi_uc* y = coutput[0];
-                    if(n == 1)
-                        for(i = 0; i < z->s->img_x; ++i)
+                    if (n == 1)
+                        for (i = 0; i < z->s->img_x; ++i)
                             out[i] = y[i];
                     else
-                        for(i = 0; i < z->s->img_x; ++i)
+                        for (i = 0; i < z->s->img_x; ++i)
                             *out++ = y[i], *out++ = 255;
                 }
             }
@@ -3983,8 +3976,8 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
         stbi__cleanup_jpeg(z);
         *out_x = z->s->img_x;
         *out_y = z->s->img_y;
-        if(comp)
-            *comp = z->s->img_n >= 3 ? 3 : 1;    // report original components, not output
+        if (comp)
+            *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
         return output;
     }
 }
@@ -3992,7 +3985,7 @@ static stbi_uc* load_jpeg_image(stbi__jpeg* z, int* out_x, int* out_y, int* comp
 static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri)
 {
     unsigned char* result;
-    stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg));
+    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
     STBI_NOTUSED(ri);
     j->s = s;
     stbi__setup_jpeg(j);
@@ -4004,7 +3997,7 @@ static void* stbi__jpeg_load(stbi__context* s, int* x, int* y, int* comp, int re
 static int stbi__jpeg_test(stbi__context* s)
 {
     int r;
-    stbi__jpeg* j = ( stbi__jpeg* )stbi__malloc(sizeof(stbi__jpeg));
+    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
     j->s = s;
     stbi__setup_jpeg(j);
     r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -4015,16 +4008,16 @@ static int stbi__jpeg_test(stbi__context* s)
 
 static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp)
 {
-    if(!stbi__decode_jpeg_header(j, STBI__SCAN_header))
+    if (!stbi__decode_jpeg_header(j, STBI__SCAN_header))
     {
         stbi__rewind(j->s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = j->s->img_x;
-    if(y)
+    if (y)
         *y = j->s->img_y;
-    if(comp)
+    if (comp)
         *comp = j->s->img_n >= 3 ? 3 : 1;
     return 1;
 }
@@ -4032,7 +4025,7 @@ static int stbi__jpeg_info_raw(stbi__jpeg* j, int* x, int* y, int* comp)
 static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp)
 {
     int result;
-    stbi__jpeg* j = ( stbi__jpeg* )(stbi__malloc(sizeof(stbi__jpeg)));
+    stbi__jpeg* j = (stbi__jpeg*)(stbi__malloc(sizeof(stbi__jpeg)));
     j->s = s;
     result = stbi__jpeg_info_raw(j, x, y, comp);
     STBI_FREE(j);
@@ -4050,7 +4043,7 @@ static int stbi__jpeg_info(stbi__context* s, int* x, int* y, int* comp)
 #ifndef STBI_NO_ZLIB
 
 // fast-way is faster to check than jpeg huffman, but slow way is slower
-#define STBI__ZFAST_BITS 9    // accelerate all cases in default tables
+#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables
 #define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
 
 // zlib-style huffman encoding
@@ -4090,40 +4083,40 @@ static int stbi__zbuild_huffman(stbi__zhuffman* z, const stbi_uc* sizelist, int
     // DEFLATE spec for generating codes
     memset(sizes, 0, sizeof(sizes));
     memset(z->fast, 0, sizeof(z->fast));
-    for(i = 0; i < num; ++i)
+    for (i = 0; i < num; ++i)
         ++sizes[sizelist[i]];
     sizes[0] = 0;
-    for(i = 1; i < 16; ++i)
-        if(sizes[i] > (1 << i))
+    for (i = 1; i < 16; ++i)
+        if (sizes[i] > (1 << i))
             return stbi__err("bad sizes", "Corrupt PNG");
     code = 0;
-    for(i = 1; i < 16; ++i)
+    for (i = 1; i < 16; ++i)
     {
         next_code[i] = code;
-        z->firstcode[i] = ( stbi__uint16 )code;
-        z->firstsymbol[i] = ( stbi__uint16 )k;
+        z->firstcode[i] = (stbi__uint16)code;
+        z->firstsymbol[i] = (stbi__uint16)k;
         code = (code + sizes[i]);
-        if(sizes[i])
-            if(code - 1 >= (1 << i))
+        if (sizes[i])
+            if (code - 1 >= (1 << i))
                 return stbi__err("bad codelengths", "Corrupt PNG");
-        z->maxcode[i] = code << (16 - i);    // preshift for inner loop
+        z->maxcode[i] = code << (16 - i); // preshift for inner loop
         code <<= 1;
         k += sizes[i];
     }
-    z->maxcode[16] = 0x10000;    // sentinel
-    for(i = 0; i < num; ++i)
+    z->maxcode[16] = 0x10000; // sentinel
+    for (i = 0; i < num; ++i)
     {
         int s = sizelist[i];
-        if(s)
+        if (s)
         {
             int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
             stbi__uint16 fastv = (stbi__uint16)((s << 9) | i);
-            z->size[c] = ( stbi_uc )s;
-            z->value[c] = ( stbi__uint16 )i;
-            if(s <= STBI__ZFAST_BITS)
+            z->size[c] = (stbi_uc)s;
+            z->value[c] = (stbi__uint16)i;
+            if (s <= STBI__ZFAST_BITS)
             {
                 int j = stbi__bit_reverse(next_code[s], s);
-                while(j < (1 << STBI__ZFAST_BITS))
+                while (j < (1 << STBI__ZFAST_BITS))
                 {
                     z->fast[j] = fastv;
                     j += (1 << s);
@@ -4157,7 +4150,7 @@ typedef struct
 
 stbi_inline static stbi_uc stbi__zget8(stbi__zbuf* z)
 {
-    if(z->zbuffer >= z->zbuffer_end)
+    if (z->zbuffer >= z->zbuffer_end)
         return 0;
     return *z->zbuffer++;
 }
@@ -4167,15 +4160,15 @@ static void stbi__fill_bits(stbi__zbuf* z)
     do
     {
         STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
-        z->code_buffer |= ( unsigned int )stbi__zget8(z) << z->num_bits;
+        z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits;
         z->num_bits += 8;
-    } while(z->num_bits <= 24);
+    } while (z->num_bits <= 24);
 }
 
 stbi_inline static unsigned int stbi__zreceive(stbi__zbuf* z, int n)
 {
     unsigned int k;
-    if(z->num_bits < n)
+    if (z->num_bits < n)
         stbi__fill_bits(z);
     k = z->code_buffer & ((1 << n) - 1);
     z->code_buffer >>= n;
@@ -4189,11 +4182,11 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z)
     // not resolved by fast table, so compute it the slow way
     // use jpeg approach, which requires MSbits at top
     k = stbi__bit_reverse(a->code_buffer, 16);
-    for(s = STBI__ZFAST_BITS + 1;; ++s)
-        if(k < z->maxcode[s])
+    for (s = STBI__ZFAST_BITS + 1;; ++s)
+        if (k < z->maxcode[s])
             break;
-    if(s == 16)
-        return -1;    // invalid code!
+    if (s == 16)
+        return -1; // invalid code!
     // code size is s, so:
     b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
     STBI_ASSERT(z->size[b] == s);
@@ -4205,10 +4198,10 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf* a, stbi__zhuffman* z)
 stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z)
 {
     int b, s;
-    if(a->num_bits < 16)
+    if (a->num_bits < 16)
         stbi__fill_bits(a);
     b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-    if(b)
+    if (b)
     {
         s = b >> 9;
         a->code_buffer >>= s;
@@ -4218,20 +4211,20 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf* a, stbi__zhuffman* z)
     return stbi__zhuffman_decode_slowpath(a, z);
 }
 
-static int stbi__zexpand(stbi__zbuf* z, char* zout, int n)    // need to make room for n bytes
+static int stbi__zexpand(stbi__zbuf* z, char* zout, int n) // need to make room for n bytes
 {
     char* q;
     int cur, limit, old_limit;
     z->zout = zout;
-    if(!z->z_expandable)
+    if (!z->z_expandable)
         return stbi__err("output buffer limit", "Corrupt PNG");
-    cur = ( int )(z->zout - z->zout_start);
-    limit = old_limit = ( int )(z->zout_end - z->zout_start);
-    while(cur + n > limit)
+    cur = (int)(z->zout - z->zout_start);
+    limit = old_limit = (int)(z->zout_end - z->zout_start);
+    while (cur + n > limit)
         limit *= 2;
-    q = ( char* )STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+    q = (char*)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
     STBI_NOTUSED(old_limit);
-    if(q == NULL)
+    if (q == NULL)
         return stbi__err("outofmem", "Out of memory");
     z->zout_start = q;
     z->zout = q + cur;
@@ -4239,82 +4232,82 @@ static int stbi__zexpand(stbi__zbuf* z, char* zout, int n)    // need to make ro
     return 1;
 }
 
-static const int stbi__zlength_base[31] = {3,  4,  5,  6,  7,  8,  9,  10,  11,  13,  15,  17,  19,  23, 27, 31,
-                                           35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0,  0};
+static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+                                           35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
 
 static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
                                             3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0};
 
-static const int stbi__zdist_base[32] = {1,    2,    3,    4,    5,    7,     9,     13,    17,  25,   33,
-                                         49,   65,   97,   129,  193,  257,   385,   513,   769, 1025, 1537,
-                                         2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0,   0};
+static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33,
+                                         49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537,
+                                         2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0};
 
-static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2,  3,  3,  4,  4,  5,  5,  6,
+static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
                                           6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
 
 static int stbi__parse_huffman_block(stbi__zbuf* a)
 {
     char* zout = a->zout;
-    for(;;)
+    for (;;)
     {
         int z = stbi__zhuffman_decode(a, &a->z_length);
-        if(z < 256)
+        if (z < 256)
         {
-            if(z < 0)
-                return stbi__err("bad huffman code", "Corrupt PNG");    // error in huffman codes
-            if(zout >= a->zout_end)
+            if (z < 0)
+                return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes
+            if (zout >= a->zout_end)
             {
-                if(!stbi__zexpand(a, zout, 1))
+                if (!stbi__zexpand(a, zout, 1))
                     return 0;
                 zout = a->zout;
             }
-            *zout++ = ( char )z;
+            *zout++ = (char)z;
         }
         else
         {
             stbi_uc* p;
             int len, dist;
-            if(z == 256)
+            if (z == 256)
             {
                 a->zout = zout;
                 return 1;
             }
             z -= 257;
             len = stbi__zlength_base[z];
-            if(stbi__zlength_extra[z])
+            if (stbi__zlength_extra[z])
                 len += stbi__zreceive(a, stbi__zlength_extra[z]);
             z = stbi__zhuffman_decode(a, &a->z_distance);
-            if(z < 0)
+            if (z < 0)
                 return stbi__err("bad huffman code", "Corrupt PNG");
             dist = stbi__zdist_base[z];
-            if(stbi__zdist_extra[z])
+            if (stbi__zdist_extra[z])
                 dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-            if(zout - a->zout_start < dist)
+            if (zout - a->zout_start < dist)
                 return stbi__err("bad dist", "Corrupt PNG");
-            if(zout + len > a->zout_end)
+            if (zout + len > a->zout_end)
             {
-                if(!stbi__zexpand(a, zout, len))
+                if (!stbi__zexpand(a, zout, len))
                     return 0;
                 zout = a->zout;
             }
-            p = ( stbi_uc* )(zout - dist);
-            if(dist == 1)
-            {    // run of one byte; common in images.
+            p = (stbi_uc*)(zout - dist);
+            if (dist == 1)
+            { // run of one byte; common in images.
                 stbi_uc v = *p;
-                if(len)
+                if (len)
                 {
                     do
                         *zout++ = v;
-                    while(--len);
+                    while (--len);
                 }
             }
             else
             {
-                if(len)
+                if (len)
                 {
                     do
                         *zout++ = *p++;
-                    while(--len);
+                    while (--len);
                 }
             }
         }
@@ -4325,7 +4318,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a)
 {
     static const stbi_uc length_dezigzag[19] = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
     stbi__zhuffman z_codelength;
-    stbi_uc lencodes[286 + 32 + 137];    // padding for maximum single op
+    stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op
     stbi_uc codelength_sizes[19];
     int i, n;
 
@@ -4335,50 +4328,50 @@ static int stbi__compute_huffman_codes(stbi__zbuf* a)
     int ntot = hlit + hdist;
 
     memset(codelength_sizes, 0, sizeof(codelength_sizes));
-    for(i = 0; i < hclen; ++i)
+    for (i = 0; i < hclen; ++i)
     {
         int s = stbi__zreceive(a, 3);
-        codelength_sizes[length_dezigzag[i]] = ( stbi_uc )s;
+        codelength_sizes[length_dezigzag[i]] = (stbi_uc)s;
     }
-    if(!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19))
+    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19))
         return 0;
 
     n = 0;
-    while(n < ntot)
+    while (n < ntot)
     {
         int c = stbi__zhuffman_decode(a, &z_codelength);
-        if(c < 0 || c >= 19)
+        if (c < 0 || c >= 19)
             return stbi__err("bad codelengths", "Corrupt PNG");
-        if(c < 16)
-            lencodes[n++] = ( stbi_uc )c;
+        if (c < 16)
+            lencodes[n++] = (stbi_uc)c;
         else
         {
             stbi_uc fill = 0;
-            if(c == 16)
+            if (c == 16)
             {
                 c = stbi__zreceive(a, 2) + 3;
-                if(n == 0)
+                if (n == 0)
                     return stbi__err("bad codelengths", "Corrupt PNG");
                 fill = lencodes[n - 1];
             }
-            else if(c == 17)
+            else if (c == 17)
                 c = stbi__zreceive(a, 3) + 3;
             else
             {
                 STBI_ASSERT(c == 18);
                 c = stbi__zreceive(a, 7) + 11;
             }
-            if(ntot - n < c)
+            if (ntot - n < c)
                 return stbi__err("bad codelengths", "Corrupt PNG");
             memset(lencodes + n, fill, c);
             n += c;
         }
     }
-    if(n != ntot)
+    if (n != ntot)
         return stbi__err("bad codelengths", "Corrupt PNG");
-    if(!stbi__zbuild_huffman(&a->z_length, lencodes, hlit))
+    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit))
         return 0;
-    if(!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist))
+    if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist))
         return 0;
     return 1;
 }
@@ -4387,28 +4380,28 @@ static int stbi__parse_uncompressed_block(stbi__zbuf* a)
 {
     stbi_uc header[4];
     int len, nlen, k;
-    if(a->num_bits & 7)
-        stbi__zreceive(a, a->num_bits & 7);    // discard
+    if (a->num_bits & 7)
+        stbi__zreceive(a, a->num_bits & 7); // discard
     // drain the bit-packed data into header
     k = 0;
-    while(a->num_bits > 0)
+    while (a->num_bits > 0)
     {
-        header[k++] = (stbi_uc)(a->code_buffer & 255);    // suppress MSVC run-time check
+        header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check
         a->code_buffer >>= 8;
         a->num_bits -= 8;
     }
     STBI_ASSERT(a->num_bits == 0);
     // now fill header the normal way
-    while(k < 4)
+    while (k < 4)
         header[k++] = stbi__zget8(a);
     len = header[1] * 256 + header[0];
     nlen = header[3] * 256 + header[2];
-    if(nlen != (len ^ 0xffff))
+    if (nlen != (len ^ 0xffff))
         return stbi__err("zlib corrupt", "Corrupt PNG");
-    if(a->zbuffer + len > a->zbuffer_end)
+    if (a->zbuffer + len > a->zbuffer_end)
         return stbi__err("read past buffer", "Corrupt PNG");
-    if(a->zout + len > a->zout_end)
-        if(!stbi__zexpand(a, a->zout, len))
+    if (a->zout + len > a->zout_end)
+        if (!stbi__zexpand(a, a->zout, len))
             return 0;
     memcpy(a->zout, a->zbuffer, len);
     a->zbuffer += len;
@@ -4422,12 +4415,12 @@ static int stbi__parse_zlib_header(stbi__zbuf* a)
     int cm = cmf & 15;
     /* int cinfo = cmf >> 4; */
     int flg = stbi__zget8(a);
-    if((cmf * 256 + flg) % 31 != 0)
-        return stbi__err("bad zlib header", "Corrupt PNG");    // zlib spec
-    if(flg & 32)
-        return stbi__err("no preset dict", "Corrupt PNG");    // preset dictionary not allowed in png
-    if(cm != 8)
-        return stbi__err("bad compression", "Corrupt PNG");    // DEFLATE required for png
+    if ((cmf * 256 + flg) % 31 != 0)
+        return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
+    if (flg & 32)
+        return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png
+    if (cm != 8)
+        return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png
     // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
     return 1;
 }
@@ -4459,8 +4452,8 @@ Init algorithm:
 static int stbi__parse_zlib(stbi__zbuf* a, int parse_header)
 {
     int final, type;
-    if(parse_header)
-        if(!stbi__parse_zlib_header(a))
+    if (parse_header)
+        if (!stbi__parse_zlib_header(a))
             return 0;
     a->num_bits = 0;
     a->code_buffer = 0;
@@ -4468,34 +4461,34 @@ static int stbi__parse_zlib(stbi__zbuf* a, int parse_header)
     {
         final = stbi__zreceive(a, 1);
         type = stbi__zreceive(a, 2);
-        if(type == 0)
+        if (type == 0)
         {
-            if(!stbi__parse_uncompressed_block(a))
+            if (!stbi__parse_uncompressed_block(a))
                 return 0;
         }
-        else if(type == 3)
+        else if (type == 3)
         {
             return 0;
         }
         else
         {
-            if(type == 1)
+            if (type == 1)
             {
                 // use fixed code lengths
-                if(!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288))
+                if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288))
                     return 0;
-                if(!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
+                if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
                     return 0;
             }
             else
             {
-                if(!stbi__compute_huffman_codes(a))
+                if (!stbi__compute_huffman_codes(a))
                     return 0;
             }
-            if(!stbi__parse_huffman_block(a))
+            if (!stbi__parse_huffman_block(a))
                 return 0;
         }
-    } while(!final);
+    } while (!final);
     return 1;
 }
 
@@ -4512,15 +4505,15 @@ static int stbi__do_zlib(stbi__zbuf* a, char* obuf, int olen, int exp, int parse
 extern char* stbi_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen)
 {
     stbi__zbuf a;
-    char* p = ( char* )stbi__malloc(initial_size);
-    if(p == NULL)
+    char* p = (char*)stbi__malloc(initial_size);
+    if (p == NULL)
         return NULL;
-    a.zbuffer = ( stbi_uc* )buffer;
-    a.zbuffer_end = ( stbi_uc* )buffer + len;
-    if(stbi__do_zlib(&a, p, initial_size, 1, 1))
+    a.zbuffer = (stbi_uc*)buffer;
+    a.zbuffer_end = (stbi_uc*)buffer + len;
+    if (stbi__do_zlib(&a, p, initial_size, 1, 1))
     {
-        if(outlen)
-            *outlen = ( int )(a.zout - a.zout_start);
+        if (outlen)
+            *outlen = (int)(a.zout - a.zout_start);
         return a.zout_start;
     }
     else
@@ -4539,15 +4532,15 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in
                                                           int parse_header)
 {
     stbi__zbuf a;
-    char* p = ( char* )stbi__malloc(initial_size);
-    if(p == NULL)
+    char* p = (char*)stbi__malloc(initial_size);
+    if (p == NULL)
         return NULL;
-    a.zbuffer = ( stbi_uc* )buffer;
-    a.zbuffer_end = ( stbi_uc* )buffer + len;
-    if(stbi__do_zlib(&a, p, initial_size, 1, parse_header))
+    a.zbuffer = (stbi_uc*)buffer;
+    a.zbuffer_end = (stbi_uc*)buffer + len;
+    if (stbi__do_zlib(&a, p, initial_size, 1, parse_header))
     {
-        if(outlen)
-            *outlen = ( int )(a.zout - a.zout_start);
+        if (outlen)
+            *outlen = (int)(a.zout - a.zout_start);
         return a.zout_start;
     }
     else
@@ -4560,10 +4553,10 @@ extern char* stbi_zlib_decode_malloc_guesssize_headerflag(const char* buffer, in
 extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen)
 {
     stbi__zbuf a;
-    a.zbuffer = ( stbi_uc* )ibuffer;
-    a.zbuffer_end = ( stbi_uc* )ibuffer + ilen;
-    if(stbi__do_zlib(&a, obuffer, olen, 0, 1))
-        return ( int )(a.zout - a.zout_start);
+    a.zbuffer = (stbi_uc*)ibuffer;
+    a.zbuffer_end = (stbi_uc*)ibuffer + ilen;
+    if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
+        return (int)(a.zout - a.zout_start);
     else
         return -1;
 }
@@ -4571,15 +4564,15 @@ extern int stbi_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer,
 extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen)
 {
     stbi__zbuf a;
-    char* p = ( char* )stbi__malloc(16384);
-    if(p == NULL)
+    char* p = (char*)stbi__malloc(16384);
+    if (p == NULL)
         return NULL;
-    a.zbuffer = ( stbi_uc* )buffer;
-    a.zbuffer_end = ( stbi_uc* )buffer + len;
-    if(stbi__do_zlib(&a, p, 16384, 1, 0))
+    a.zbuffer = (stbi_uc*)buffer;
+    a.zbuffer_end = (stbi_uc*)buffer + len;
+    if (stbi__do_zlib(&a, p, 16384, 1, 0))
     {
-        if(outlen)
-            *outlen = ( int )(a.zout - a.zout_start);
+        if (outlen)
+            *outlen = (int)(a.zout - a.zout_start);
         return a.zout_start;
     }
     else
@@ -4592,10 +4585,10 @@ extern char* stbi_zlib_decode_noheader_malloc(char const* buffer, int len, int*
 extern int stbi_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen)
 {
     stbi__zbuf a;
-    a.zbuffer = ( stbi_uc* )ibuffer;
-    a.zbuffer_end = ( stbi_uc* )ibuffer + ilen;
-    if(stbi__do_zlib(&a, obuffer, olen, 0, 0))
-        return ( int )(a.zout - a.zout_start);
+    a.zbuffer = (stbi_uc*)ibuffer;
+    a.zbuffer_end = (stbi_uc*)ibuffer + ilen;
+    if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
+        return (int)(a.zout - a.zout_start);
     else
         return -1;
 }
@@ -4630,8 +4623,8 @@ static int stbi__check_png_header(stbi__context* s)
 {
     static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
     int i;
-    for(i = 0; i < 8; ++i)
-        if(stbi__get8(s) != png_sig[i])
+    for (i = 0; i < 8; ++i)
+        if (stbi__get8(s) != png_sig[i])
             return stbi__err("bad png sig", "Not a PNG");
     return 1;
 }
@@ -4663,9 +4656,9 @@ static int stbi__paeth(int a, int b, int c)
     int pa = abs(p - a);
     int pb = abs(p - b);
     int pc = abs(p - c);
-    if(pa <= pb && pa <= pc)
+    if (pa <= pb && pa <= pc)
         return a;
-    if(pb <= pc)
+    if (pb <= pc)
         return b;
     return c;
 }
@@ -4681,18 +4674,18 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
     stbi__uint32 i, j, stride = x * out_n * bytes;
     stbi__uint32 img_len, img_width_bytes;
     int k;
-    int img_n = s->img_n;    // copy it into a local for later
+    int img_n = s->img_n; // copy it into a local for later
 
     int output_bytes = out_n * bytes;
     int filter_bytes = img_n * bytes;
     int width = x;
 
     STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
-    a->out = ( stbi_uc* )stbi__malloc_mad3(x, y, output_bytes, 0);    // extra bytes to write off the end into
-    if(!a->out)
+    a->out = (stbi_uc*)stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+    if (!a->out)
         return stbi__err("outofmem", "Out of memory");
 
-    if(!stbi__mad3sizes_valid(img_n, x, depth, 7))
+    if (!stbi__mad3sizes_valid(img_n, x, depth, 7))
         return stbi__err("too large", "Corrupt PNG");
     img_width_bytes = (((img_n * x * depth) + 7) >> 3);
     img_len = (img_width_bytes + 1) * y;
@@ -4700,75 +4693,74 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
     // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
     // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
     // so just check for raw_len < img_len always.
-    if(raw_len < img_len)
+    if (raw_len < img_len)
         return stbi__err("not enough pixels", "Corrupt PNG");
 
-    for(j = 0; j < y; ++j)
+    for (j = 0; j < y; ++j)
     {
         stbi_uc* cur = a->out + stride * j;
         stbi_uc* prior;
         int filter = *raw++;
 
-        if(filter > 4)
+        if (filter > 4)
             return stbi__err("invalid filter", "Corrupt PNG");
 
-        if(depth < 8)
+        if (depth < 8)
         {
             STBI_ASSERT(img_width_bytes <= x);
-            cur += x * out_n -
-                   img_width_bytes;    // store output to the rightmost img_len bytes, so we can decode in place
+            cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
             filter_bytes = 1;
             width = img_width_bytes;
         }
-        prior = cur - stride;    // bugfix: need to compute this after 'cur +=' computation above
+        prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
 
         // if first row, use special filter that doesn't sample previous row
-        if(j == 0)
+        if (j == 0)
             filter = first_row_filter[filter];
 
         // handle first byte explicitly
-        for(k = 0; k < filter_bytes; ++k)
+        for (k = 0; k < filter_bytes; ++k)
         {
-            switch(filter)
+            switch (filter)
             {
-                case STBI__F_none:
-                    cur[k] = raw[k];
-                    break;
-                case STBI__F_sub:
-                    cur[k] = raw[k];
-                    break;
-                case STBI__F_up:
-                    cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-                    break;
-                case STBI__F_avg:
-                    cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
-                    break;
-                case STBI__F_paeth:
-                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
-                    break;
-                case STBI__F_avg_first:
-                    cur[k] = raw[k];
-                    break;
-                case STBI__F_paeth_first:
-                    cur[k] = raw[k];
-                    break;
+            case STBI__F_none:
+                cur[k] = raw[k];
+                break;
+            case STBI__F_sub:
+                cur[k] = raw[k];
+                break;
+            case STBI__F_up:
+                cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+                break;
+            case STBI__F_avg:
+                cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
+                break;
+            case STBI__F_paeth:
+                cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
+                break;
+            case STBI__F_avg_first:
+                cur[k] = raw[k];
+                break;
+            case STBI__F_paeth_first:
+                cur[k] = raw[k];
+                break;
             }
         }
 
-        if(depth == 8)
+        if (depth == 8)
         {
-            if(img_n != out_n)
-                cur[img_n] = 255;    // first pixel
+            if (img_n != out_n)
+                cur[img_n] = 255; // first pixel
             raw += img_n;
             cur += out_n;
             prior += out_n;
         }
-        else if(depth == 16)
+        else if (depth == 16)
         {
-            if(img_n != out_n)
+            if (img_n != out_n)
             {
-                cur[filter_bytes] = 255;    // first pixel top byte
-                cur[filter_bytes + 1] = 255;    // first pixel bottom byte
+                cur[filter_bytes] = 255;     // first pixel top byte
+                cur[filter_bytes + 1] = 255; // first pixel bottom byte
             }
             raw += filter_bytes;
             cur += output_bytes;
@@ -4782,49 +4774,48 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
         }
 
         // this is a little gross, so that we don't switch per-pixel or per-component
-        if(depth < 8 || img_n == out_n)
+        if (depth < 8 || img_n == out_n)
         {
             int nk = (width - 1) * filter_bytes;
 #define STBI__CASE(f) \
     case f:           \
-        for(k = 0; k < nk; ++k)
-            switch(filter)
+        for (k = 0; k < nk; ++k)
+            switch (filter)
             {
-                // "none" filter turns into a memcpy here; make that explicit.
-                case STBI__F_none:
-                    memcpy(cur, raw, nk);
-                    break;
-                    STBI__CASE(STBI__F_sub)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
-                    }
-                    break;
-                    STBI__CASE(STBI__F_up)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-                    }
-                    break;
-                    STBI__CASE(STBI__F_avg)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1));
-                    }
-                    break;
-                    STBI__CASE(STBI__F_paeth)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] +
-                                                stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes]));
-                    }
-                    break;
-                    STBI__CASE(STBI__F_avg_first)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
-                    }
-                    break;
-                    STBI__CASE(STBI__F_paeth_first)
-                    {
-                        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0));
-                    }
-                    break;
+            // "none" filter turns into a memcpy here; make that explicit.
+            case STBI__F_none:
+                memcpy(cur, raw, nk);
+                break;
+                STBI__CASE(STBI__F_sub)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
+                }
+                break;
+                STBI__CASE(STBI__F_up)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+                }
+                break;
+                STBI__CASE(STBI__F_avg)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1));
+                }
+                break;
+                STBI__CASE(STBI__F_paeth)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes]));
+                }
+                break;
+                STBI__CASE(STBI__F_avg_first)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
+                }
+                break;
+                STBI__CASE(STBI__F_paeth_first)
+                {
+                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0));
+                }
+                break;
             }
 #undef STBI__CASE
             raw += nk;
@@ -4832,12 +4823,12 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
         else
         {
             STBI_ASSERT(img_n + 1 == out_n);
-#define STBI__CASE(f)                                                                                      \
-    case f:                                                                                                \
-        for(i = x - 1; i >= 1;                                                                             \
-            --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \
-            for(k = 0; k < filter_bytes; ++k)
-            switch(filter)
+#define STBI__CASE(f)                                                                                       \
+    case f:                                                                                                 \
+        for (i = x - 1; i >= 1;                                                                             \
+             --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \
+            for (k = 0; k < filter_bytes; ++k)
+            switch (filter)
             {
                 STBI__CASE(STBI__F_none)
                 {
@@ -4861,8 +4852,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
                 break;
                 STBI__CASE(STBI__F_paeth)
                 {
-                    cur[k] =
-                        STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes]));
+                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes]));
                 }
                 break;
                 STBI__CASE(STBI__F_avg_first)
@@ -4880,10 +4870,10 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
 
             // the loop above sets the high byte of the pixels' alpha, but for
             // 16 bit png files we also need the low byte set. we'll do that here.
-            if(depth == 16)
+            if (depth == 16)
             {
-                cur = a->out + stride * j;    // start at the beginning of the row again
-                for(i = 0; i < x; ++i, cur += output_bytes)
+                cur = a->out + stride * j; // start at the beginning of the row again
+                for (i = 0; i < x; ++i, cur += output_bytes)
                 {
                     cur[filter_bytes + 1] = 255;
                 }
@@ -4894,17 +4884,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
     // we make a separate pass to expand bits to pixels; for performance,
     // this could run two scanlines behind the above code, so it won't
     // intefere with filtering but will still be in the cache.
-    if(depth < 8)
+    if (depth < 8)
     {
-        for(j = 0; j < y; ++j)
+        for (j = 0; j < y; ++j)
         {
             stbi_uc* cur = a->out + stride * j;
             stbi_uc* in = a->out + stride * j + x * out_n - img_width_bytes;
             // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for
             // 1/2/4-bit png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data
             // that will be skipped in the later loop
-            stbi_uc scale =
-                (color == 0) ? stbi__depth_scale_table[depth] : 1;    // scale grayscale values to 0..255 range
+            stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
 
             // note that the final byte might overshoot and write more data than desired.
             // we can allocate enough data that this never writes out of memory, but it
@@ -4912,35 +4901,35 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
             // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
             // so we need to explicitly clamp the final ones
 
-            if(depth == 4)
+            if (depth == 4)
             {
-                for(k = x * img_n; k >= 2; k -= 2, ++in)
+                for (k = x * img_n; k >= 2; k -= 2, ++in)
                 {
                     *cur++ = scale * ((*in >> 4));
                     *cur++ = scale * ((*in) & 0x0f);
                 }
-                if(k > 0)
+                if (k > 0)
                     *cur++ = scale * ((*in >> 4));
             }
-            else if(depth == 2)
+            else if (depth == 2)
             {
-                for(k = x * img_n; k >= 4; k -= 4, ++in)
+                for (k = x * img_n; k >= 4; k -= 4, ++in)
                 {
                     *cur++ = scale * ((*in >> 6));
                     *cur++ = scale * ((*in >> 4) & 0x03);
                     *cur++ = scale * ((*in >> 2) & 0x03);
                     *cur++ = scale * ((*in) & 0x03);
                 }
-                if(k > 0)
+                if (k > 0)
                     *cur++ = scale * ((*in >> 6));
-                if(k > 1)
+                if (k > 1)
                     *cur++ = scale * ((*in >> 4) & 0x03);
-                if(k > 2)
+                if (k > 2)
                     *cur++ = scale * ((*in >> 2) & 0x03);
             }
-            else if(depth == 1)
+            else if (depth == 1)
             {
-                for(k = x * img_n; k >= 8; k -= 8, ++in)
+                for (k = x * img_n; k >= 8; k -= 8, ++in)
                 {
                     *cur++ = scale * ((*in >> 7));
                     *cur++ = scale * ((*in >> 6) & 0x01);
@@ -4951,29 +4940,29 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
                     *cur++ = scale * ((*in >> 1) & 0x01);
                     *cur++ = scale * ((*in) & 0x01);
                 }
-                if(k > 0)
+                if (k > 0)
                     *cur++ = scale * ((*in >> 7));
-                if(k > 1)
+                if (k > 1)
                     *cur++ = scale * ((*in >> 6) & 0x01);
-                if(k > 2)
+                if (k > 2)
                     *cur++ = scale * ((*in >> 5) & 0x01);
-                if(k > 3)
+                if (k > 3)
                     *cur++ = scale * ((*in >> 4) & 0x01);
-                if(k > 4)
+                if (k > 4)
                     *cur++ = scale * ((*in >> 3) & 0x01);
-                if(k > 5)
+                if (k > 5)
                     *cur++ = scale * ((*in >> 2) & 0x01);
-                if(k > 6)
+                if (k > 6)
                     *cur++ = scale * ((*in >> 1) & 0x01);
             }
-            if(img_n != out_n)
+            if (img_n != out_n)
             {
                 int q;
                 // insert alpha = 255
                 cur = a->out + stride * j;
-                if(img_n == 1)
+                if (img_n == 1)
                 {
-                    for(q = x - 1; q >= 0; --q)
+                    for (q = x - 1; q >= 0; --q)
                     {
                         cur[q * 2 + 1] = 255;
                         cur[q * 2 + 0] = cur[q];
@@ -4982,7 +4971,7 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
                 else
                 {
                     STBI_ASSERT(img_n == 3);
-                    for(q = x - 1; q >= 0; --q)
+                    for (q = x - 1; q >= 0; --q)
                     {
                         cur[q * 4 + 3] = 255;
                         cur[q * 4 + 2] = cur[q * 3 + 2];
@@ -4993,16 +4982,16 @@ static int stbi__create_png_image_raw(stbi__png* a, stbi_uc* raw, stbi__uint32 r
             }
         }
     }
-    else if(depth == 16)
+    else if (depth == 16)
     {
         // force the image data from big-endian to platform-native.
         // this is done in a separate pass due to the decoding relying
         // on the data being untouched, but could probably be done
         // per-line during decode if care is taken.
         stbi_uc* cur = a->out;
-        stbi__uint16* cur16 = ( stbi__uint16* )cur;
+        stbi__uint16* cur16 = (stbi__uint16*)cur;
 
-        for(i = 0; i < x * y * out_n; ++i, cur16++, cur += 2)
+        for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2)
         {
             *cur16 = (cur[0] << 8) | cur[1];
         }
@@ -5018,12 +5007,12 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3
     int out_bytes = out_n * bytes;
     stbi_uc* final;
     int p;
-    if(!interlaced)
+    if (!interlaced)
         return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
     // de-interlacing
-    final = ( stbi_uc* )stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
-    for(p = 0; p < 7; ++p)
+    final = (stbi_uc*)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+    for (p = 0; p < 7; ++p)
     {
         int xorig[] = {0, 4, 0, 2, 0, 1, 0};
         int yorig[] = {0, 0, 4, 0, 2, 0, 1};
@@ -5033,17 +5022,17 @@ static int stbi__create_png_image(stbi__png* a, stbi_uc* image_data, stbi__uint3
         // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
         x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
         y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
-        if(x && y)
+        if (x && y)
         {
             stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
-            if(!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color))
+            if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color))
             {
                 STBI_FREE(final);
                 return 0;
             }
-            for(j = 0; j < y; ++j)
+            for (j = 0; j < y; ++j)
             {
-                for(i = 0; i < x; ++i)
+                for (i = 0; i < x; ++i)
                 {
                     int out_y = j * yspc[p] + yorig[p];
                     int out_x = i * xspc[p] + xorig[p];
@@ -5071,9 +5060,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n)
     // already got 255 as the alpha value in the output
     STBI_ASSERT(out_n == 2 || out_n == 4);
 
-    if(out_n == 2)
+    if (out_n == 2)
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             p[1] = (p[0] == tc[0] ? 0 : 255);
             p += 2;
@@ -5081,9 +5070,9 @@ static int stbi__compute_transparency(stbi__png* z, stbi_uc tc[3], int out_n)
     }
     else
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
-            if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
                 p[3] = 0;
             p += 4;
         }
@@ -5095,15 +5084,15 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou
 {
     stbi__context* s = z->s;
     stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-    stbi__uint16* p = ( stbi__uint16* )z->out;
+    stbi__uint16* p = (stbi__uint16*)z->out;
 
     // compute color-based transparency, assuming we've
     // already got 65535 as the alpha value in the output
     STBI_ASSERT(out_n == 2 || out_n == 4);
 
-    if(out_n == 2)
+    if (out_n == 2)
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             p[1] = (p[0] == tc[0] ? 0 : 65535);
             p += 2;
@@ -5111,9 +5100,9 @@ static int stbi__compute_transparency16(stbi__png* z, stbi__uint16 tc[3], int ou
     }
     else
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
-            if(p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
                 p[3] = 0;
             p += 4;
         }
@@ -5126,16 +5115,16 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int
     stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
     stbi_uc *p, *temp_out, *orig = a->out;
 
-    p = ( stbi_uc* )stbi__malloc_mad2(pixel_count, pal_img_n, 0);
-    if(p == NULL)
+    p = (stbi_uc*)stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+    if (p == NULL)
         return stbi__err("outofmem", "Out of memory");
 
     // between here and free(out) below, exitting would leak
     temp_out = p;
 
-    if(pal_img_n == 3)
+    if (pal_img_n == 3)
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             int n = orig[i] * 4;
             p[0] = palette[n];
@@ -5146,7 +5135,7 @@ static int stbi__expand_png_palette(stbi__png* a, stbi_uc* palette, int len, int
     }
     else
     {
-        for(i = 0; i < pixel_count; ++i)
+        for (i = 0; i < pixel_count; ++i)
         {
             int n = orig[i] * 4;
             p[0] = palette[n];
@@ -5183,9 +5172,9 @@ static void stbi__de_iphone(stbi__png* z)
     stbi__uint32 i, pixel_count = s->img_x * s->img_y;
     stbi_uc* p = z->out;
 
-    if(s->img_out_n == 3)
-    {    // convert bgr to rgb
-        for(i = 0; i < pixel_count; ++i)
+    if (s->img_out_n == 3)
+    { // convert bgr to rgb
+        for (i = 0; i < pixel_count; ++i)
         {
             stbi_uc t = p[0];
             p[0] = p[2];
@@ -5196,14 +5185,14 @@ static void stbi__de_iphone(stbi__png* z)
     else
     {
         STBI_ASSERT(s->img_out_n == 4);
-        if(stbi__unpremultiply_on_load)
+        if (stbi__unpremultiply_on_load)
         {
             // convert bgr to rgb and unpremultiply
-            for(i = 0; i < pixel_count; ++i)
+            for (i = 0; i < pixel_count; ++i)
             {
                 stbi_uc a = p[3];
                 stbi_uc t = p[0];
-                if(a)
+                if (a)
                 {
                     stbi_uc half = a / 2;
                     p[0] = (p[2] * 255 + half) / a;
@@ -5221,7 +5210,7 @@ static void stbi__de_iphone(stbi__png* z)
         else
         {
             // convert bgr to rgb
-            for(i = 0; i < pixel_count; ++i)
+            for (i = 0; i < pixel_count; ++i)
             {
                 stbi_uc t = p[0];
                 p[0] = p[2];
@@ -5233,7 +5222,7 @@ static void stbi__de_iphone(stbi__png* z)
 }
 
 #define STBI__PNG_TYPE(a, b, c, d) \
-    ((( unsigned )(a) << 24) + (( unsigned )(b) << 16) + (( unsigned )(c) << 8) + ( unsigned )(d))
+    (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + (unsigned)(d))
 
 static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp)
 {
@@ -5248,250 +5237,249 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp)
     z->idata = NULL;
     z->out = NULL;
 
-    if(!stbi__check_png_header(s))
+    if (!stbi__check_png_header(s))
         return 0;
 
-    if(scan == STBI__SCAN_type)
+    if (scan == STBI__SCAN_type)
         return 1;
 
-    for(;;)
+    for (;;)
     {
         stbi__pngchunk c = stbi__get_chunk_header(s);
-        switch(c.type)
+        switch (c.type)
         {
-            case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
-                is_iphone = 1;
-                stbi__skip(s, c.length);
-                break;
-            case STBI__PNG_TYPE('I', 'H', 'D', 'R'):
+        case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
+            is_iphone = 1;
+            stbi__skip(s, c.length);
+            break;
+        case STBI__PNG_TYPE('I', 'H', 'D', 'R'):
+        {
+            int comp, filter;
+            if (!first)
+                return stbi__err("multiple IHDR", "Corrupt PNG");
+            first = 0;
+            if (c.length != 13)
+                return stbi__err("bad IHDR len", "Corrupt PNG");
+            s->img_x = stbi__get32be(s);
+            if (s->img_x > (1 << 24))
+                return stbi__err("too large", "Very large image (corrupt?)");
+            s->img_y = stbi__get32be(s);
+            if (s->img_y > (1 << 24))
+                return stbi__err("too large", "Very large image (corrupt?)");
+            z->depth = stbi__get8(s);
+            if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)
+                return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);
+            if (color > 6)
+                return stbi__err("bad ctype", "Corrupt PNG");
+            if (color == 3 && z->depth == 16)
+                return stbi__err("bad ctype", "Corrupt PNG");
+            if (color == 3)
+                pal_img_n = 3;
+            else if (color & 1)
+                return stbi__err("bad ctype", "Corrupt PNG");
+            comp = stbi__get8(s);
+            if (comp)
+                return stbi__err("bad comp method", "Corrupt PNG");
+            filter = stbi__get8(s);
+            if (filter)
+                return stbi__err("bad filter method", "Corrupt PNG");
+            interlace = stbi__get8(s);
+            if (interlace > 1)
+                return stbi__err("bad interlace method", "Corrupt PNG");
+            if (!s->img_x || !s->img_y)
+                return stbi__err("0-pixel image", "Corrupt PNG");
+            if (!pal_img_n)
             {
-                int comp, filter;
-                if(!first)
-                    return stbi__err("multiple IHDR", "Corrupt PNG");
-                first = 0;
-                if(c.length != 13)
-                    return stbi__err("bad IHDR len", "Corrupt PNG");
-                s->img_x = stbi__get32be(s);
-                if(s->img_x > (1 << 24))
-                    return stbi__err("too large", "Very large image (corrupt?)");
-                s->img_y = stbi__get32be(s);
-                if(s->img_y > (1 << 24))
-                    return stbi__err("too large", "Very large image (corrupt?)");
-                z->depth = stbi__get8(s);
-                if(z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)
-                    return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only");
-                color = stbi__get8(s);
-                if(color > 6)
-                    return stbi__err("bad ctype", "Corrupt PNG");
-                if(color == 3 && z->depth == 16)
-                    return stbi__err("bad ctype", "Corrupt PNG");
-                if(color == 3)
-                    pal_img_n = 3;
-                else if(color & 1)
-                    return stbi__err("bad ctype", "Corrupt PNG");
-                comp = stbi__get8(s);
-                if(comp)
-                    return stbi__err("bad comp method", "Corrupt PNG");
-                filter = stbi__get8(s);
-                if(filter)
-                    return stbi__err("bad filter method", "Corrupt PNG");
-                interlace = stbi__get8(s);
-                if(interlace > 1)
-                    return stbi__err("bad interlace method", "Corrupt PNG");
-                if(!s->img_x || !s->img_y)
-                    return stbi__err("0-pixel image", "Corrupt PNG");
-                if(!pal_img_n)
-                {
-                    s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
-                    if((1 << 30) / s->img_x / s->img_n < s->img_y)
-                        return stbi__err("too large", "Image too large to decode");
-                    if(scan == STBI__SCAN_header)
-                        return 1;
-                }
-                else
-                {
-                    // if paletted, then pal_n is our final components, and
-                    // img_n is # components to decompress/filter.
-                    s->img_n = 1;
-                    if((1 << 30) / s->img_x / 4 < s->img_y)
-                        return stbi__err("too large", "Corrupt PNG");
-                    // if SCAN_header, have to scan to see if we have a tRNS
-                }
-                break;
+                s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+                if ((1 << 30) / s->img_x / s->img_n < s->img_y)
+                    return stbi__err("too large", "Image too large to decode");
+                if (scan == STBI__SCAN_header)
+                    return 1;
+            }
+            else
+            {
+                // if paletted, then pal_n is our final components, and
+                // img_n is # components to decompress/filter.
+                s->img_n = 1;
+                if ((1 << 30) / s->img_x / 4 < s->img_y)
+                    return stbi__err("too large", "Corrupt PNG");
+                // if SCAN_header, have to scan to see if we have a tRNS
             }
+            break;
+        }
 
-            case STBI__PNG_TYPE('P', 'L', 'T', 'E'):
+        case STBI__PNG_TYPE('P', 'L', 'T', 'E'):
+        {
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256 * 3)
+                return stbi__err("invalid PLTE", "Corrupt PNG");
+            pal_len = c.length / 3;
+            if (pal_len * 3 != c.length)
+                return stbi__err("invalid PLTE", "Corrupt PNG");
+            for (i = 0; i < pal_len; ++i)
+            {
+                palette[i * 4 + 0] = stbi__get8(s);
+                palette[i * 4 + 1] = stbi__get8(s);
+                palette[i * 4 + 2] = stbi__get8(s);
+                palette[i * 4 + 3] = 255;
+            }
+            break;
+        }
+
+        case STBI__PNG_TYPE('t', 'R', 'N', 'S'):
+        {
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata)
+                return stbi__err("tRNS after IDAT", "Corrupt PNG");
+            if (pal_img_n)
             {
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(c.length > 256 * 3)
-                    return stbi__err("invalid PLTE", "Corrupt PNG");
-                pal_len = c.length / 3;
-                if(pal_len * 3 != c.length)
-                    return stbi__err("invalid PLTE", "Corrupt PNG");
-                for(i = 0; i < pal_len; ++i)
+                if (scan == STBI__SCAN_header)
                 {
-                    palette[i * 4 + 0] = stbi__get8(s);
-                    palette[i * 4 + 1] = stbi__get8(s);
-                    palette[i * 4 + 2] = stbi__get8(s);
-                    palette[i * 4 + 3] = 255;
+                    s->img_n = 4;
+                    return 1;
                 }
-                break;
+                if (pal_len == 0)
+                    return stbi__err("tRNS before PLTE", "Corrupt PNG");
+                if (c.length > pal_len)
+                    return stbi__err("bad tRNS len", "Corrupt PNG");
+                pal_img_n = 4;
+                for (i = 0; i < c.length; ++i)
+                    palette[i * 4 + 3] = stbi__get8(s);
             }
-
-            case STBI__PNG_TYPE('t', 'R', 'N', 'S'):
+            else
             {
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(z->idata)
-                    return stbi__err("tRNS after IDAT", "Corrupt PNG");
-                if(pal_img_n)
+                if (!(s->img_n & 1))
+                    return stbi__err("tRNS with alpha", "Corrupt PNG");
+                if (c.length != (stbi__uint32)s->img_n * 2)
+                    return stbi__err("bad tRNS len", "Corrupt PNG");
+                has_trans = 1;
+                if (z->depth == 16)
                 {
-                    if(scan == STBI__SCAN_header)
-                    {
-                        s->img_n = 4;
-                        return 1;
-                    }
-                    if(pal_len == 0)
-                        return stbi__err("tRNS before PLTE", "Corrupt PNG");
-                    if(c.length > pal_len)
-                        return stbi__err("bad tRNS len", "Corrupt PNG");
-                    pal_img_n = 4;
-                    for(i = 0; i < c.length; ++i)
-                        palette[i * 4 + 3] = stbi__get8(s);
+                    for (k = 0; k < s->img_n; ++k)
+                        tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
                 }
                 else
                 {
-                    if(!(s->img_n & 1))
-                        return stbi__err("tRNS with alpha", "Corrupt PNG");
-                    if(c.length != ( stbi__uint32 )s->img_n * 2)
-                        return stbi__err("bad tRNS len", "Corrupt PNG");
-                    has_trans = 1;
-                    if(z->depth == 16)
-                    {
-                        for(k = 0; k < s->img_n; ++k)
-                            tc16[k] = ( stbi__uint16 )stbi__get16be(s);    // copy the values as-is
-                    }
-                    else
-                    {
-                        for(k = 0; k < s->img_n; ++k)
-                            tc[k] = (stbi_uc)(stbi__get16be(s) & 255) *
-                                    stbi__depth_scale_table[z->depth];    // non 8-bit images will be larger
-                    }
+                    for (k = 0; k < s->img_n; ++k)
+                        tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
                 }
-                break;
             }
+            break;
+        }
 
-            case STBI__PNG_TYPE('I', 'D', 'A', 'T'):
+        case STBI__PNG_TYPE('I', 'D', 'A', 'T'):
+        {
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len)
+                return stbi__err("no PLTE", "Corrupt PNG");
+            if (scan == STBI__SCAN_header)
             {
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(pal_img_n && !pal_len)
-                    return stbi__err("no PLTE", "Corrupt PNG");
-                if(scan == STBI__SCAN_header)
-                {
-                    s->img_n = pal_img_n;
-                    return 1;
-                }
-                if(( int )(ioff + c.length) < ( int )ioff)
-                    return 0;
-                if(ioff + c.length > idata_limit)
-                {
-                    stbi__uint32 idata_limit_old = idata_limit;
-                    stbi_uc* p;
-                    if(idata_limit == 0)
-                        idata_limit = c.length > 4096 ? c.length : 4096;
-                    while(ioff + c.length > idata_limit)
-                        idata_limit *= 2;
-                    STBI_NOTUSED(idata_limit_old);
-                    p = ( stbi_uc* )STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit);
-                    if(p == NULL)
-                        return stbi__err("outofmem", "Out of memory");
-                    z->idata = p;
-                }
-                if(!stbi__getn(s, z->idata + ioff, c.length))
-                    return stbi__err("outofdata", "Corrupt PNG");
-                ioff += c.length;
-                break;
+                s->img_n = pal_img_n;
+                return 1;
+            }
+            if ((int)(ioff + c.length) < (int)ioff)
+                return 0;
+            if (ioff + c.length > idata_limit)
+            {
+                stbi__uint32 idata_limit_old = idata_limit;
+                stbi_uc* p;
+                if (idata_limit == 0)
+                    idata_limit = c.length > 4096 ? c.length : 4096;
+                while (ioff + c.length > idata_limit)
+                    idata_limit *= 2;
+                STBI_NOTUSED(idata_limit_old);
+                p = (stbi_uc*)STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit);
+                if (p == NULL)
+                    return stbi__err("outofmem", "Out of memory");
+                z->idata = p;
             }
+            if (!stbi__getn(s, z->idata + ioff, c.length))
+                return stbi__err("outofdata", "Corrupt PNG");
+            ioff += c.length;
+            break;
+        }
 
-            case STBI__PNG_TYPE('I', 'E', 'N', 'D'):
+        case STBI__PNG_TYPE('I', 'E', 'N', 'D'):
+        {
+            stbi__uint32 raw_len, bpl;
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load)
+                return 1;
+            if (z->idata == NULL)
+                return stbi__err("no IDAT", "Corrupt PNG");
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            z->expanded = (stbi_uc*)stbi_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len,
+                                                                                 (int*)&raw_len, !is_iphone);
+            if (z->expanded == NULL)
+                return 0; // zlib should set error
+            STBI_FREE(z->idata);
+            z->idata = NULL;
+            if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans)
+                s->img_out_n = s->img_n + 1;
+            else
+                s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace))
+                return 0;
+            if (has_trans)
             {
-                stbi__uint32 raw_len, bpl;
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if(scan != STBI__SCAN_load)
-                    return 1;
-                if(z->idata == NULL)
-                    return stbi__err("no IDAT", "Corrupt PNG");
-                // initial guess for decoded data size to avoid unnecessary reallocs
-                bpl = (s->img_x * z->depth + 7) / 8;    // bytes per line, per component
-                raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
-                z->expanded = ( stbi_uc* )stbi_zlib_decode_malloc_guesssize_headerflag(( char* )z->idata, ioff, raw_len,
-                                                                                       ( int* )&raw_len, !is_iphone);
-                if(z->expanded == NULL)
-                    return 0;    // zlib should set error
-                STBI_FREE(z->idata);
-                z->idata = NULL;
-                if((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans)
-                    s->img_out_n = s->img_n + 1;
-                else
-                    s->img_out_n = s->img_n;
-                if(!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace))
-                    return 0;
-                if(has_trans)
+                if (z->depth == 16)
                 {
-                    if(z->depth == 16)
-                    {
-                        if(!stbi__compute_transparency16(z, tc16, s->img_out_n))
-                            return 0;
-                    }
-                    else
-                    {
-                        if(!stbi__compute_transparency(z, tc, s->img_out_n))
-                            return 0;
-                    }
-                }
-                if(is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
-                    stbi__de_iphone(z);
-                if(pal_img_n)
-                {
-                    // pal_img_n == 3 or 4
-                    s->img_n = pal_img_n;    // record the actual colors we had
-                    s->img_out_n = pal_img_n;
-                    if(req_comp >= 3)
-                        s->img_out_n = req_comp;
-                    if(!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                    if (!stbi__compute_transparency16(z, tc16, s->img_out_n))
                         return 0;
                 }
-                else if(has_trans)
+                else
                 {
-                    // non-paletted image with tRNS -> source image has (constant) alpha
-                    ++s->img_n;
+                    if (!stbi__compute_transparency(z, tc, s->img_out_n))
+                        return 0;
                 }
-                STBI_FREE(z->expanded);
-                z->expanded = NULL;
-                return 1;
             }
+            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+                stbi__de_iphone(z);
+            if (pal_img_n)
+            {
+                // pal_img_n == 3 or 4
+                s->img_n = pal_img_n; // record the actual colors we had
+                s->img_out_n = pal_img_n;
+                if (req_comp >= 3)
+                    s->img_out_n = req_comp;
+                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                    return 0;
+            }
+            else if (has_trans)
+            {
+                // non-paletted image with tRNS -> source image has (constant) alpha
+                ++s->img_n;
+            }
+            STBI_FREE(z->expanded);
+            z->expanded = NULL;
+            return 1;
+        }
 
-            default:
-                // if critical, fail
-                if(first)
-                    return stbi__err("first not IHDR", "Corrupt PNG");
-                if((c.type & (1 << 29)) == 0)
-                {
+        default:
+            // if critical, fail
+            if (first)
+                return stbi__err("first not IHDR", "Corrupt PNG");
+            if ((c.type & (1 << 29)) == 0)
+            {
 #ifndef STBI_NO_FAILURE_STRINGS
-                    // not threadsafe
-                    static char invalid_chunk[] = "XXXX PNG chunk not known";
-                    invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
-                    invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
-                    invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
-                    invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
+                // not threadsafe
+                static char invalid_chunk[] = "XXXX PNG chunk not known";
+                invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+                invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+                invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
+                invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
 #endif
-                    return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
-                }
-                stbi__skip(s, c.length);
-                break;
+                return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+            }
+            stbi__skip(s, c.length);
+            break;
         }
         // end of PNG chunk, read and skip CRC
         stbi__get32be(s);
@@ -5501,31 +5489,30 @@ static int stbi__parse_png_file(stbi__png* z, int scan, int req_comp)
 static void* stbi__do_png(stbi__png* p, int* x, int* y, int* n, int req_comp, stbi__result_info* ri)
 {
     void* result = NULL;
-    if(req_comp < 0 || req_comp > 4)
+    if (req_comp < 0 || req_comp > 4)
         return stbi__errpuc("bad req_comp", "Internal error");
-    if(stbi__parse_png_file(p, STBI__SCAN_load, req_comp))
+    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp))
     {
-        if(p->depth < 8)
+        if (p->depth < 8)
             ri->bits_per_channel = 8;
         else
             ri->bits_per_channel = p->depth;
         result = p->out;
         p->out = NULL;
-        if(req_comp && req_comp != p->s->img_out_n)
+        if (req_comp && req_comp != p->s->img_out_n)
         {
-            if(ri->bits_per_channel == 8)
-                result =
-                    stbi__convert_format(( unsigned char* )result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+            if (ri->bits_per_channel == 8)
+                result = stbi__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
             else
-                result = stbi__convert_format16(( stbi__uint16* )result, p->s->img_out_n, req_comp, p->s->img_x,
+                result = stbi__convert_format16((stbi__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x,
                                                 p->s->img_y);
             p->s->img_out_n = req_comp;
-            if(result == NULL)
+            if (result == NULL)
                 return result;
         }
         *x = p->s->img_x;
         *y = p->s->img_y;
-        if(n)
+        if (n)
             *n = p->s->img_n;
     }
     STBI_FREE(p->out);
@@ -5555,16 +5542,16 @@ static int stbi__png_test(stbi__context* s)
 
 static int stbi__png_info_raw(stbi__png* p, int* x, int* y, int* comp)
 {
-    if(!stbi__parse_png_file(p, STBI__SCAN_header, 0))
+    if (!stbi__parse_png_file(p, STBI__SCAN_header, 0))
     {
         stbi__rewind(p->s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = p->s->img_x;
-    if(y)
+    if (y)
         *y = p->s->img_y;
-    if(comp)
+    if (comp)
         *comp = p->s->img_n;
     return 1;
 }
@@ -5580,9 +5567,9 @@ static int stbi__png_is16(stbi__context* s)
 {
     stbi__png p;
     p.s = s;
-    if(!stbi__png_info_raw(&p, NULL, NULL, NULL))
+    if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
         return 0;
-    if(p.depth != 16)
+    if (p.depth != 16)
     {
         stbi__rewind(p.s);
         return 0;
@@ -5598,14 +5585,14 @@ static int stbi__bmp_test_raw(stbi__context* s)
 {
     int r;
     int sz;
-    if(stbi__get8(s) != 'B')
+    if (stbi__get8(s) != 'B')
         return 0;
-    if(stbi__get8(s) != 'M')
+    if (stbi__get8(s) != 'M')
         return 0;
-    stbi__get32le(s);    // discard filesize
-    stbi__get16le(s);    // discard reserved
-    stbi__get16le(s);    // discard reserved
-    stbi__get32le(s);    // discard data offset
+    stbi__get32le(s); // discard filesize
+    stbi__get16le(s); // discard reserved
+    stbi__get16le(s); // discard reserved
+    stbi__get32le(s); // discard data offset
     sz = stbi__get32le(s);
     r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
     return r;
@@ -5622,28 +5609,28 @@ static int stbi__bmp_test(stbi__context* s)
 static int stbi__high_bit(unsigned int z)
 {
     int n = 0;
-    if(z == 0)
+    if (z == 0)
         return -1;
-    if(z >= 0x10000)
+    if (z >= 0x10000)
         n += 16, z >>= 16;
-    if(z >= 0x00100)
+    if (z >= 0x00100)
         n += 8, z >>= 8;
-    if(z >= 0x00010)
+    if (z >= 0x00010)
         n += 4, z >>= 4;
-    if(z >= 0x00004)
+    if (z >= 0x00004)
         n += 2, z >>= 2;
-    if(z >= 0x00002)
+    if (z >= 0x00002)
         n += 1, z >>= 1;
     return n;
 }
 
 static int stbi__bitcount(unsigned int a)
 {
-    a = (a & 0x55555555) + ((a >> 1) & 0x55555555);    // max 2
-    a = (a & 0x33333333) + ((a >> 2) & 0x33333333);    // max 4
-    a = (a + (a >> 4)) & 0x0f0f0f0f;    // max 8 per 4, now 8 bits
-    a = (a + (a >> 8));    // max 16 per 8 bits
-    a = (a + (a >> 16));    // max 32 per 8 bits
+    a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
+    a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
+    a = (a + (a >> 4)) & 0x0f0f0f0f;                // max 8 per 4, now 8 bits
+    a = (a + (a >> 8));                             // max 16 per 8 bits
+    a = (a + (a >> 16));                            // max 32 per 8 bits
     return a & 0xff;
 }
 
@@ -5664,16 +5651,24 @@ static int stbi__shiftsigned(int v, int shift, int bits)
         0x01 /*0b00000001*/,
     };
     static unsigned int shift_table[9] = {
-        0, 0, 0, 1, 0, 2, 4, 6, 0,
+        0,
+        0,
+        0,
+        1,
+        0,
+        2,
+        4,
+        6,
+        0,
     };
-    if(shift < 0)
+    if (shift < 0)
         v <<= -shift;
     else
         v >>= shift;
     STBI_ASSERT(v >= 0 && v < 256);
     v >>= (8 - bits);
     STBI_ASSERT(bits >= 0 && bits <= 8);
-    return ( int )(( unsigned )v * mul_table[bits]) >> shift_table[bits];
+    return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits];
 }
 
 typedef struct
@@ -5685,18 +5680,18 @@ typedef struct
 static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
 {
     int hsz;
-    if(stbi__get8(s) != 'B' || stbi__get8(s) != 'M')
+    if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M')
         return stbi__errpuc("not BMP", "Corrupt BMP");
-    stbi__get32le(s);    // discard filesize
-    stbi__get16le(s);    // discard reserved
-    stbi__get16le(s);    // discard reserved
+    stbi__get32le(s); // discard filesize
+    stbi__get16le(s); // discard reserved
+    stbi__get16le(s); // discard reserved
     info->offset = stbi__get32le(s);
     info->hsz = hsz = stbi__get32le(s);
     info->mr = info->mg = info->mb = info->ma = 0;
 
-    if(hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124)
+    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124)
         return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
-    if(hsz == 12)
+    if (hsz == 12)
     {
         s->img_x = stbi__get16le(s);
         s->img_y = stbi__get16le(s);
@@ -5706,39 +5701,39 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
         s->img_x = stbi__get32le(s);
         s->img_y = stbi__get32le(s);
     }
-    if(stbi__get16le(s) != 1)
+    if (stbi__get16le(s) != 1)
         return stbi__errpuc("bad BMP", "bad BMP");
     info->bpp = stbi__get16le(s);
-    if(hsz != 12)
+    if (hsz != 12)
     {
         int compress = stbi__get32le(s);
-        if(compress == 1 || compress == 2)
+        if (compress == 1 || compress == 2)
             return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
-        stbi__get32le(s);    // discard sizeof
-        stbi__get32le(s);    // discard hres
-        stbi__get32le(s);    // discard vres
-        stbi__get32le(s);    // discard colorsused
-        stbi__get32le(s);    // discard max important
-        if(hsz == 40 || hsz == 56)
-        {
-            if(hsz == 56)
+        stbi__get32le(s); // discard sizeof
+        stbi__get32le(s); // discard hres
+        stbi__get32le(s); // discard vres
+        stbi__get32le(s); // discard colorsused
+        stbi__get32le(s); // discard max important
+        if (hsz == 40 || hsz == 56)
+        {
+            if (hsz == 56)
             {
                 stbi__get32le(s);
                 stbi__get32le(s);
                 stbi__get32le(s);
                 stbi__get32le(s);
             }
-            if(info->bpp == 16 || info->bpp == 32)
+            if (info->bpp == 16 || info->bpp == 32)
             {
-                if(compress == 0)
+                if (compress == 0)
                 {
-                    if(info->bpp == 32)
+                    if (info->bpp == 32)
                     {
                         info->mr = 0xffu << 16;
                         info->mg = 0xffu << 8;
                         info->mb = 0xffu << 0;
                         info->ma = 0xffu << 24;
-                        info->all_a = 0;    // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+                        info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
                     }
                     else
                     {
@@ -5747,13 +5742,13 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
                         info->mb = 31u << 0;
                     }
                 }
-                else if(compress == 3)
+                else if (compress == 3)
                 {
                     info->mr = stbi__get32le(s);
                     info->mg = stbi__get32le(s);
                     info->mb = stbi__get32le(s);
                     // not documented, but generated by photoshop and handled by mspaint
-                    if(info->mr == info->mg && info->mg == info->mb)
+                    if (info->mr == info->mg && info->mg == info->mb)
                     {
                         // ?!?!?
                         return stbi__errpuc("bad BMP", "bad BMP");
@@ -5766,25 +5761,25 @@ static void* stbi__bmp_parse_header(stbi__context* s, stbi__bmp_data* info)
         else
         {
             int i;
-            if(hsz != 108 && hsz != 124)
+            if (hsz != 108 && hsz != 124)
                 return stbi__errpuc("bad BMP", "bad BMP");
             info->mr = stbi__get32le(s);
             info->mg = stbi__get32le(s);
             info->mb = stbi__get32le(s);
             info->ma = stbi__get32le(s);
-            stbi__get32le(s);    // discard color space
-            for(i = 0; i < 12; ++i)
-                stbi__get32le(s);    // discard color space parameters
-            if(hsz == 124)
+            stbi__get32le(s); // discard color space
+            for (i = 0; i < 12; ++i)
+                stbi__get32le(s); // discard color space parameters
+            if (hsz == 124)
             {
-                stbi__get32le(s);    // discard rendering intent
-                stbi__get32le(s);    // discard offset of profile data
-                stbi__get32le(s);    // discard size of profile data
-                stbi__get32le(s);    // discard reserved
+                stbi__get32le(s); // discard rendering intent
+                stbi__get32le(s); // discard offset of profile data
+                stbi__get32le(s); // discard size of profile data
+                stbi__get32le(s); // discard reserved
             }
         }
     }
-    return ( void* )1;
+    return (void*)1;
 }
 
 static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req_comp, stbi__result_info* ri)
@@ -5798,11 +5793,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
     STBI_NOTUSED(ri);
 
     info.all_a = 255;
-    if(stbi__bmp_parse_header(s, &info) == NULL)
-        return NULL;    // error code already set
+    if (stbi__bmp_parse_header(s, &info) == NULL)
+        return NULL; // error code already set
 
-    flip_vertically = (( int )s->img_y) > 0;
-    s->img_y = abs(( int )s->img_y);
+    flip_vertically = ((int)s->img_y) > 0;
+    s->img_y = abs((int)s->img_y);
 
     mr = info.mr;
     mg = info.mg;
@@ -5810,53 +5805,53 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
     ma = info.ma;
     all_a = info.all_a;
 
-    if(info.hsz == 12)
+    if (info.hsz == 12)
     {
-        if(info.bpp < 24)
+        if (info.bpp < 24)
             psize = (info.offset - 14 - 24) / 3;
     }
     else
     {
-        if(info.bpp < 16)
+        if (info.bpp < 16)
             psize = (info.offset - 14 - info.hsz) >> 2;
     }
 
     s->img_n = ma ? 4 : 3;
-    if(req_comp && req_comp >= 3)    // we can directly decode 3 or 4
+    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
         target = req_comp;
     else
-        target = s->img_n;    // if they want monochrome, we'll post-convert
+        target = s->img_n; // if they want monochrome, we'll post-convert
 
     // sanity-check size
-    if(!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+    if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
         return stbi__errpuc("too large", "Corrupt BMP");
 
-    out = ( stbi_uc* )stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
-    if(!out)
+    out = (stbi_uc*)stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
+    if (!out)
         return stbi__errpuc("outofmem", "Out of memory");
-    if(info.bpp < 16)
+    if (info.bpp < 16)
     {
         int z = 0;
-        if(psize == 0 || psize > 256)
+        if (psize == 0 || psize > 256)
         {
             STBI_FREE(out);
             return stbi__errpuc("invalid", "Corrupt BMP");
         }
-        for(i = 0; i < psize; ++i)
+        for (i = 0; i < psize; ++i)
         {
             pal[i][2] = stbi__get8(s);
             pal[i][1] = stbi__get8(s);
             pal[i][0] = stbi__get8(s);
-            if(info.hsz != 12)
+            if (info.hsz != 12)
                 stbi__get8(s);
             pal[i][3] = 255;
         }
         stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-        if(info.bpp == 1)
+        if (info.bpp == 1)
             width = (s->img_x + 7) >> 3;
-        else if(info.bpp == 4)
+        else if (info.bpp == 4)
             width = (s->img_x + 1) >> 1;
-        else if(info.bpp == 8)
+        else if (info.bpp == 8)
             width = s->img_x;
         else
         {
@@ -5864,18 +5859,18 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
             return stbi__errpuc("bad bpp", "Corrupt BMP");
         }
         pad = (-width) & 3;
-        if(info.bpp == 1)
+        if (info.bpp == 1)
         {
-            for(j = 0; j < ( int )s->img_y; ++j)
+            for (j = 0; j < (int)s->img_y; ++j)
             {
                 int bit_offset = 7, v = stbi__get8(s);
-                for(i = 0; i < ( int )s->img_x; ++i)
+                for (i = 0; i < (int)s->img_x; ++i)
                 {
                     int color = (v >> bit_offset) & 0x1;
                     out[z++] = pal[color][0];
                     out[z++] = pal[color][1];
                     out[z++] = pal[color][2];
-                    if((--bit_offset) < 0)
+                    if ((--bit_offset) < 0)
                     {
                         bit_offset = 7;
                         v = stbi__get8(s);
@@ -5886,12 +5881,12 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
         }
         else
         {
-            for(j = 0; j < ( int )s->img_y; ++j)
+            for (j = 0; j < (int)s->img_y; ++j)
             {
-                for(i = 0; i < ( int )s->img_x; i += 2)
+                for (i = 0; i < (int)s->img_x; i += 2)
                 {
                     int v = stbi__get8(s), v2 = 0;
-                    if(info.bpp == 4)
+                    if (info.bpp == 4)
                     {
                         v2 = v & 15;
                         v >>= 4;
@@ -5899,15 +5894,15 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
                     out[z++] = pal[v][0];
                     out[z++] = pal[v][1];
                     out[z++] = pal[v][2];
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = 255;
-                    if(i + 1 == ( int )s->img_x)
+                    if (i + 1 == (int)s->img_x)
                         break;
                     v = (info.bpp == 8) ? stbi__get8(s) : v2;
                     out[z++] = pal[v][0];
                     out[z++] = pal[v][1];
                     out[z++] = pal[v][2];
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = 255;
                 }
                 stbi__skip(s, pad);
@@ -5920,25 +5915,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
         int z = 0;
         int easy = 0;
         stbi__skip(s, info.offset - 14 - info.hsz);
-        if(info.bpp == 24)
+        if (info.bpp == 24)
             width = 3 * s->img_x;
-        else if(info.bpp == 16)
+        else if (info.bpp == 16)
             width = 2 * s->img_x;
         else /* bpp = 32 and pad = 0 */
             width = 0;
         pad = (-width) & 3;
-        if(info.bpp == 24)
+        if (info.bpp == 24)
         {
             easy = 1;
         }
-        else if(info.bpp == 32)
+        else if (info.bpp == 32)
         {
-            if(mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
                 easy = 2;
         }
-        if(!easy)
+        if (!easy)
         {
-            if(!mr || !mg || !mb)
+            if (!mr || !mg || !mb)
             {
                 STBI_FREE(out);
                 return stbi__errpuc("bad masks", "Corrupt BMP");
@@ -5953,11 +5948,11 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
             ashift = stbi__high_bit(ma) - 7;
             acount = stbi__bitcount(ma);
         }
-        for(j = 0; j < ( int )s->img_y; ++j)
+        for (j = 0; j < (int)s->img_y; ++j)
         {
-            if(easy)
+            if (easy)
             {
-                for(i = 0; i < ( int )s->img_x; ++i)
+                for (i = 0; i < (int)s->img_x; ++i)
                 {
                     unsigned char a;
                     out[z + 2] = stbi__get8(s);
@@ -5966,23 +5961,23 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
                     z += 3;
                     a = (easy == 2 ? stbi__get8(s) : 255);
                     all_a |= a;
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = a;
                 }
             }
             else
             {
                 int bpp = info.bpp;
-                for(i = 0; i < ( int )s->img_x; ++i)
+                for (i = 0; i < (int)s->img_x; ++i)
                 {
-                    stbi__uint32 v = (bpp == 16 ? ( stbi__uint32 )stbi__get16le(s) : stbi__get32le(s));
+                    stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s));
                     unsigned int a;
                     out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
                     out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
                     out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
                     a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
                     all_a |= a;
-                    if(target == 4)
+                    if (target == 4)
                         out[z++] = STBI__BYTECAST(a);
                 }
             }
@@ -5991,34 +5986,34 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
     }
 
     // if alpha channel is all 0s, replace with all 255s
-    if(target == 4 && all_a == 0)
-        for(i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4)
+    if (target == 4 && all_a == 0)
+        for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4)
             out[i] = 255;
 
-    if(flip_vertically)
+    if (flip_vertically)
     {
         stbi_uc t;
-        for(j = 0; j<( int )s->img_y>> 1; ++j)
+        for (j = 0; j < (int)s->img_y >> 1; ++j)
         {
             stbi_uc* p1 = out + j * s->img_x * target;
             stbi_uc* p2 = out + (s->img_y - 1 - j) * s->img_x * target;
-            for(i = 0; i < ( int )s->img_x * target; ++i)
+            for (i = 0; i < (int)s->img_x * target; ++i)
             {
                 t = p1[i], p1[i] = p2[i], p2[i] = t;
             }
         }
     }
 
-    if(req_comp && req_comp != target)
+    if (req_comp && req_comp != target)
     {
         out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
-        if(out == NULL)
-            return out;    // stbi__convert_format frees input on failure
+        if (out == NULL)
+            return out; // stbi__convert_format frees input on failure
     }
 
     *x = s->img_x;
     *y = s->img_y;
-    if(comp)
+    if (comp)
         *comp = s->img_n;
     return out;
 }
@@ -6031,25 +6026,25 @@ static void* stbi__bmp_load(stbi__context* s, int* x, int* y, int* comp, int req
 static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
 {
     // only RGB or RGBA (incl. 16bit) or grey allowed
-    if(is_rgb16)
+    if (is_rgb16)
         *is_rgb16 = 0;
-    switch(bits_per_pixel)
-    {
-        case 8:
-            return STBI_grey;
-        case 16:
-            if(is_grey)
-                return STBI_grey_alpha;
-            // fallthrough
-        case 15:
-            if(is_rgb16)
-                *is_rgb16 = 1;
-            return STBI_rgb;
-        case 24:    // fallthrough
-        case 32:
-            return bits_per_pixel / 8;
-        default:
-            return 0;
+    switch (bits_per_pixel)
+    {
+    case 8:
+        return STBI_grey;
+    case 16:
+        if (is_grey)
+            return STBI_grey_alpha;
+        // fallthrough
+    case 15:
+        if (is_rgb16)
+            *is_rgb16 = 1;
+        return STBI_rgb;
+    case 24: // fallthrough
+    case 32:
+        return bits_per_pixel / 8;
+    default:
+        return 0;
     }
 }
 
@@ -6057,58 +6052,58 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp)
 {
     int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
     int sz, tga_colormap_type;
-    stbi__get8(s);    // discard Offset
-    tga_colormap_type = stbi__get8(s);    // colormap type
-    if(tga_colormap_type > 1)
+    stbi__get8(s);                     // discard Offset
+    tga_colormap_type = stbi__get8(s); // colormap type
+    if (tga_colormap_type > 1)
     {
         stbi__rewind(s);
-        return 0;    // only RGB or indexed allowed
+        return 0; // only RGB or indexed allowed
     }
-    tga_image_type = stbi__get8(s);    // image type
-    if(tga_colormap_type == 1)
-    {    // colormapped (paletted) image
-        if(tga_image_type != 1 && tga_image_type != 9)
+    tga_image_type = stbi__get8(s); // image type
+    if (tga_colormap_type == 1)
+    { // colormapped (paletted) image
+        if (tga_image_type != 1 && tga_image_type != 9)
         {
             stbi__rewind(s);
             return 0;
         }
-        stbi__skip(s, 4);    // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s);    //   check bits per palette color entry
-        if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+        stbi__skip(s, 4);   // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s); //   check bits per palette color entry
+        if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
         {
             stbi__rewind(s);
             return 0;
         }
-        stbi__skip(s, 4);    // skip image x and y origin
+        stbi__skip(s, 4); // skip image x and y origin
         tga_colormap_bpp = sz;
     }
     else
-    {    // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
-        if((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11))
+    { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if ((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11))
         {
             stbi__rewind(s);
-            return 0;    // only RGB or grey allowed, +/- RLE
+            return 0; // only RGB or grey allowed, +/- RLE
         }
-        stbi__skip(s, 9);    // skip colormap specification and image x/y origin
+        stbi__skip(s, 9); // skip colormap specification and image x/y origin
         tga_colormap_bpp = 0;
     }
     tga_w = stbi__get16le(s);
-    if(tga_w < 1)
+    if (tga_w < 1)
     {
         stbi__rewind(s);
-        return 0;    // test width
+        return 0; // test width
     }
     tga_h = stbi__get16le(s);
-    if(tga_h < 1)
+    if (tga_h < 1)
     {
         stbi__rewind(s);
-        return 0;    // test height
+        return 0; // test height
     }
-    tga_bits_per_pixel = stbi__get8(s);    // bits per pixel
-    stbi__get8(s);    // ignore alpha bits
-    if(tga_colormap_bpp != 0)
+    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
+    stbi__get8(s);                      // ignore alpha bits
+    if (tga_colormap_bpp != 0)
     {
-        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16))
+        if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16))
         {
             // when using a colormap, tga_bits_per_pixel is the size of the indexes
             // I don't think anything but 8 or 16bit indexes makes sense
@@ -6121,56 +6116,56 @@ static int stbi__tga_info(stbi__context* s, int* x, int* y, int* comp)
     {
         tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
     }
-    if(!tga_comp)
+    if (!tga_comp)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = tga_w;
-    if(y)
+    if (y)
         *y = tga_h;
-    if(comp)
+    if (comp)
         *comp = tga_comp;
-    return 1;    // seems to have passed everything
+    return 1; // seems to have passed everything
 }
 
 static int stbi__tga_test(stbi__context* s)
 {
     int res = 0;
     int sz, tga_color_type;
-    stbi__get8(s);    //   discard Offset
-    tga_color_type = stbi__get8(s);    //   color type
-    if(tga_color_type > 1)
-        goto errorEnd;    //   only RGB or indexed allowed
-    sz = stbi__get8(s);    //   image type
-    if(tga_color_type == 1)
-    {    // colormapped (paletted) image
-        if(sz != 1 && sz != 9)
-            goto errorEnd;    // colortype 1 demands image type 1 or 9
-        stbi__skip(s, 4);    // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s);    //   check bits per palette color entry
-        if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+    stbi__get8(s);                  //   discard Offset
+    tga_color_type = stbi__get8(s); //   color type
+    if (tga_color_type > 1)
+        goto errorEnd;  //   only RGB or indexed allowed
+    sz = stbi__get8(s); //   image type
+    if (tga_color_type == 1)
+    { // colormapped (paletted) image
+        if (sz != 1 && sz != 9)
+            goto errorEnd;  // colortype 1 demands image type 1 or 9
+        stbi__skip(s, 4);   // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s); //   check bits per palette color entry
+        if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
             goto errorEnd;
-        stbi__skip(s, 4);    // skip image x and y origin
+        stbi__skip(s, 4); // skip image x and y origin
     }
     else
-    {    // "normal" image w/o colormap
-        if((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11))
-            goto errorEnd;    // only RGB or grey allowed, +/- RLE
-        stbi__skip(s, 9);    // skip colormap specification and image x/y origin
-    }
-    if(stbi__get16le(s) < 1)
-        goto errorEnd;    //   test width
-    if(stbi__get16le(s) < 1)
-        goto errorEnd;    //   test height
-    sz = stbi__get8(s);    //   bits per pixel
-    if((tga_color_type == 1) && (sz != 8) && (sz != 16))
-        goto errorEnd;    // for colormapped images, bpp is size of an index
-    if((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
+    { // "normal" image w/o colormap
+        if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11))
+            goto errorEnd; // only RGB or grey allowed, +/- RLE
+        stbi__skip(s, 9);  // skip colormap specification and image x/y origin
+    }
+    if (stbi__get16le(s) < 1)
+        goto errorEnd; //   test width
+    if (stbi__get16le(s) < 1)
+        goto errorEnd;  //   test height
+    sz = stbi__get8(s); //   bits per pixel
+    if ((tga_color_type == 1) && (sz != 8) && (sz != 16))
+        goto errorEnd; // for colormapped images, bpp is size of an index
+    if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
         goto errorEnd;
 
-    res = 1;    // if we got this far, everything's good and we can return 1 instead of 0
+    res = 1; // if we got this far, everything's good and we can return 1 instead of 0
 
 errorEnd:
     stbi__rewind(s);
@@ -6180,7 +6175,7 @@ static int stbi__tga_test(stbi__context* s)
 // read 16bit value and convert to 24bit RGB
 static void stbi__tga_read_rgb16(stbi__context* s, stbi_uc* out)
 {
-    stbi__uint16 px = ( stbi__uint16 )stbi__get16le(s);
+    stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
     stbi__uint16 fiveBitMask = 31;
     // we have 3 channels with 5bits each
     int r = (px >> 10) & fiveBitMask;
@@ -6226,7 +6221,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     STBI_NOTUSED(ri);
 
     //   do a tiny bit of precessing
-    if(tga_image_type >= 8)
+    if (tga_image_type >= 8)
     {
         tga_image_type -= 8;
         tga_is_RLE = 1;
@@ -6234,33 +6229,33 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     tga_inverted = 1 - ((tga_inverted >> 5) & 1);
 
     //   If I'm paletted, then I'll use the number of bits from the palette
-    if(tga_indexed)
+    if (tga_indexed)
         tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
     else
         tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
 
-    if(!tga_comp)    // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+    if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
         return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
 
     //   tga info
     *x = tga_width;
     *y = tga_height;
-    if(comp)
+    if (comp)
         *comp = tga_comp;
 
-    if(!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+    if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
         return stbi__errpuc("too large", "Corrupt TGA");
 
-    tga_data = ( unsigned char* )stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
-    if(!tga_data)
+    tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+    if (!tga_data)
         return stbi__errpuc("outofmem", "Out of memory");
 
     // skip to the data's starting position (offset usually = 0)
     stbi__skip(s, tga_offset);
 
-    if(!tga_indexed && !tga_is_RLE && !tga_rgb16)
+    if (!tga_indexed && !tga_is_RLE && !tga_rgb16)
     {
-        for(i = 0; i < tga_height; ++i)
+        for (i = 0; i < tga_height; ++i)
         {
             int row = tga_inverted ? tga_height - i - 1 : i;
             stbi_uc* tga_row = tga_data + row * tga_width * tga_comp;
@@ -6270,28 +6265,28 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     else
     {
         //   do I need to load a palette?
-        if(tga_indexed)
+        if (tga_indexed)
         {
             //   any data to skip? (offset usually = 0)
             stbi__skip(s, tga_palette_start);
             //   load the palette
-            tga_palette = ( unsigned char* )stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
-            if(!tga_palette)
+            tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+            if (!tga_palette)
             {
                 STBI_FREE(tga_data);
                 return stbi__errpuc("outofmem", "Out of memory");
             }
-            if(tga_rgb16)
+            if (tga_rgb16)
             {
                 stbi_uc* pal_entry = tga_palette;
                 STBI_ASSERT(tga_comp == STBI_rgb);
-                for(i = 0; i < tga_palette_len; ++i)
+                for (i = 0; i < tga_palette_len; ++i)
                 {
                     stbi__tga_read_rgb16(s, pal_entry);
                     pal_entry += tga_comp;
                 }
             }
-            else if(!stbi__getn(s, tga_palette, tga_palette_len * tga_comp))
+            else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp))
             {
                 STBI_FREE(tga_data);
                 STBI_FREE(tga_palette);
@@ -6299,12 +6294,12 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
             }
         }
         //   load the data
-        for(i = 0; i < tga_width * tga_height; ++i)
+        for (i = 0; i < tga_width * tga_height; ++i)
         {
             //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
-            if(tga_is_RLE)
+            if (tga_is_RLE)
             {
-                if(RLE_count == 0)
+                if (RLE_count == 0)
                 {
                     //   yep, get the next byte as a RLE command
                     int RLE_cmd = stbi__get8(s);
@@ -6312,7 +6307,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
                     RLE_repeating = RLE_cmd >> 7;
                     read_next_pixel = 1;
                 }
-                else if(!RLE_repeating)
+                else if (!RLE_repeating)
                 {
                     read_next_pixel = 1;
                 }
@@ -6322,25 +6317,25 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
                 read_next_pixel = 1;
             }
             //   OK, if I need to read a pixel, do it now
-            if(read_next_pixel)
+            if (read_next_pixel)
             {
                 //   load however much data we did have
-                if(tga_indexed)
+                if (tga_indexed)
                 {
                     // read in index, then perform the lookup
                     int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
-                    if(pal_idx >= tga_palette_len)
+                    if (pal_idx >= tga_palette_len)
                     {
                         // invalid index
                         pal_idx = 0;
                     }
                     pal_idx *= tga_comp;
-                    for(j = 0; j < tga_comp; ++j)
+                    for (j = 0; j < tga_comp; ++j)
                     {
                         raw_data[j] = tga_palette[pal_idx + j];
                     }
                 }
-                else if(tga_rgb16)
+                else if (tga_rgb16)
                 {
                     STBI_ASSERT(tga_comp == STBI_rgb);
                     stbi__tga_read_rgb16(s, raw_data);
@@ -6348,30 +6343,30 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
                 else
                 {
                     //   read in the data raw
-                    for(j = 0; j < tga_comp; ++j)
+                    for (j = 0; j < tga_comp; ++j)
                     {
                         raw_data[j] = stbi__get8(s);
                     }
                 }
                 //   clear the reading flag for the next pixel
                 read_next_pixel = 0;
-            }    // end of reading a pixel
+            } // end of reading a pixel
 
             // copy data
-            for(j = 0; j < tga_comp; ++j)
+            for (j = 0; j < tga_comp; ++j)
                 tga_data[i * tga_comp + j] = raw_data[j];
 
             //   in case we're in RLE mode, keep counting down
             --RLE_count;
         }
         //   do I need to invert the image?
-        if(tga_inverted)
+        if (tga_inverted)
         {
-            for(j = 0; j * 2 < tga_height; ++j)
+            for (j = 0; j * 2 < tga_height; ++j)
             {
                 int index1 = j * tga_width * tga_comp;
                 int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
-                for(i = tga_width * tga_comp; i > 0; --i)
+                for (i = tga_width * tga_comp; i > 0; --i)
                 {
                     unsigned char temp = tga_data[index1];
                     tga_data[index1] = tga_data[index2];
@@ -6382,17 +6377,17 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
             }
         }
         //   clear my palette, if I had one
-        if(tga_palette != NULL)
+        if (tga_palette != NULL)
         {
             STBI_FREE(tga_palette);
         }
     }
 
     // swap RGB - if the source data was RGB16, it already is in the right order
-    if(tga_comp >= 3 && !tga_rgb16)
+    if (tga_comp >= 3 && !tga_rgb16)
     {
         unsigned char* tga_pixel = tga_data;
-        for(i = 0; i < tga_width * tga_height; ++i)
+        for (i = 0; i < tga_width * tga_height; ++i)
         {
             unsigned char temp = tga_pixel[0];
             tga_pixel[0] = tga_pixel[2];
@@ -6402,7 +6397,7 @@ static void* stbi__tga_load(stbi__context* s, int* x, int* y, int* comp, int req
     }
 
     // convert to target component count
-    if(req_comp && req_comp != tga_comp)
+    if (req_comp && req_comp != tga_comp)
         tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
 
     //   the things I do to get rid of an error message, and yet keep
@@ -6429,38 +6424,38 @@ static int stbi__psd_decode_rle(stbi__context* s, stbi_uc* p, int pixelCount)
     int count, nleft, len;
 
     count = 0;
-    while((nleft = pixelCount - count) > 0)
+    while ((nleft = pixelCount - count) > 0)
     {
         len = stbi__get8(s);
-        if(len == 128)
+        if (len == 128)
         {
             // No-op.
         }
-        else if(len < 128)
+        else if (len < 128)
         {
             // Copy next len+1 bytes literally.
             len++;
-            if(len > nleft)
-                return 0;    // corrupt data
+            if (len > nleft)
+                return 0; // corrupt data
             count += len;
-            while(len)
+            while (len)
             {
                 *p = stbi__get8(s);
                 p += 4;
                 len--;
             }
         }
-        else if(len > 128)
+        else if (len > 128)
         {
             stbi_uc val;
             // Next -len+1 bytes in the dest are replicated from next source byte.
             // (Interpret len as a negative 8-bit int.)
             len = 257 - len;
-            if(len > nleft)
-                return 0;    // corrupt data
+            if (len > nleft)
+                return 0; // corrupt data
             val = stbi__get8(s);
             count += len;
-            while(len)
+            while (len)
             {
                 *p = val;
                 p += 4;
@@ -6483,11 +6478,11 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     STBI_NOTUSED(ri);
 
     // Check identifier
-    if(stbi__get32be(s) != 0x38425053)    // "8BPS"
+    if (stbi__get32be(s) != 0x38425053) // "8BPS"
         return stbi__errpuc("not PSD", "Corrupt PSD image");
 
     // Check file type version.
-    if(stbi__get16be(s) != 1)
+    if (stbi__get16be(s) != 1)
         return stbi__errpuc("wrong version", "Unsupported version of PSD image");
 
     // Skip 6 reserved bytes.
@@ -6495,7 +6490,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
 
     // Read the number of channels (R, G, B, A, etc).
     channelCount = stbi__get16be(s);
-    if(channelCount < 0 || channelCount > 16)
+    if (channelCount < 0 || channelCount > 16)
         return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
 
     // Read the rows and columns of the image.
@@ -6504,7 +6499,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
 
     // Make sure the depth is 8 bits.
     bitdepth = stbi__get16be(s);
-    if(bitdepth != 8 && bitdepth != 16)
+    if (bitdepth != 8 && bitdepth != 16)
         return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
 
     // Make sure the color mode is RGB.
@@ -6517,7 +6512,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     //   7: Multichannel
     //   8: Duotone
     //   9: Lab color
-    if(stbi__get16be(s) != 3)
+    if (stbi__get16be(s) != 3)
         return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
 
     // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
@@ -6534,24 +6529,24 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     //   0: no compression
     //   1: RLE compressed
     compression = stbi__get16be(s);
-    if(compression > 1)
+    if (compression > 1)
         return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
     // Check size
-    if(!stbi__mad3sizes_valid(4, w, h, 0))
+    if (!stbi__mad3sizes_valid(4, w, h, 0))
         return stbi__errpuc("too large", "Corrupt PSD");
 
     // Create the destination image.
 
-    if(!compression && bitdepth == 16 && bpc == 16)
+    if (!compression && bitdepth == 16 && bpc == 16)
     {
-        out = ( stbi_uc* )stbi__malloc_mad3(8, w, h, 0);
+        out = (stbi_uc*)stbi__malloc_mad3(8, w, h, 0);
         ri->bits_per_channel = 16;
     }
     else
-        out = ( stbi_uc* )stbi__malloc(4 * (size_t)w * h);
+        out = (stbi_uc*)stbi__malloc(4 * (size_t)w * h);
 
-    if(!out)
+    if (!out)
         return stbi__errpuc("outofmem", "Out of memory");
     pixelCount = w * h;
 
@@ -6559,7 +6554,7 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     // memset( out, 0, pixelCount * 4 );
 
     // Finally, the image data.
-    if(compression)
+    if (compression)
     {
         // RLE as used by .PSD and .TIFF
         // Loop until you get the number of unpacked bytes you are expecting:
@@ -6574,21 +6569,21 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
         stbi__skip(s, h * channelCount * 2);
 
         // Read the RLE data by channel.
-        for(channel = 0; channel < 4; channel++)
+        for (channel = 0; channel < 4; channel++)
         {
             stbi_uc* p;
 
             p = out + channel;
-            if(channel >= channelCount)
+            if (channel >= channelCount)
             {
                 // Fill this channel with default data.
-                for(i = 0; i < pixelCount; i++, p += 4)
+                for (i = 0; i < pixelCount; i++, p += 4)
                     *p = (channel == 3 ? 255 : 0);
             }
             else
             {
                 // Read the RLE data.
-                if(!stbi__psd_decode_rle(s, p, pixelCount))
+                if (!stbi__psd_decode_rle(s, p, pixelCount))
                 {
                     STBI_FREE(out);
                     return stbi__errpuc("corrupt", "bad RLE data");
@@ -6602,45 +6597,45 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
         // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
 
         // Read the data by channel.
-        for(channel = 0; channel < 4; channel++)
+        for (channel = 0; channel < 4; channel++)
         {
-            if(channel >= channelCount)
+            if (channel >= channelCount)
             {
                 // Fill this channel with default data.
-                if(bitdepth == 16 && bpc == 16)
+                if (bitdepth == 16 && bpc == 16)
                 {
-                    stbi__uint16* q = (( stbi__uint16* )out) + channel;
+                    stbi__uint16* q = ((stbi__uint16*)out) + channel;
                     stbi__uint16 val = channel == 3 ? 65535 : 0;
-                    for(i = 0; i < pixelCount; i++, q += 4)
+                    for (i = 0; i < pixelCount; i++, q += 4)
                         *q = val;
                 }
                 else
                 {
                     stbi_uc* p = out + channel;
                     stbi_uc val = channel == 3 ? 255 : 0;
-                    for(i = 0; i < pixelCount; i++, p += 4)
+                    for (i = 0; i < pixelCount; i++, p += 4)
                         *p = val;
                 }
             }
             else
             {
-                if(ri->bits_per_channel == 16)
-                {    // output bpc
-                    stbi__uint16* q = (( stbi__uint16* )out) + channel;
-                    for(i = 0; i < pixelCount; i++, q += 4)
-                        *q = ( stbi__uint16 )stbi__get16be(s);
+                if (ri->bits_per_channel == 16)
+                { // output bpc
+                    stbi__uint16* q = ((stbi__uint16*)out) + channel;
+                    for (i = 0; i < pixelCount; i++, q += 4)
+                        *q = (stbi__uint16)stbi__get16be(s);
                 }
                 else
                 {
                     stbi_uc* p = out + channel;
-                    if(bitdepth == 16)
-                    {    // input bpc
-                        for(i = 0; i < pixelCount; i++, p += 4)
+                    if (bitdepth == 16)
+                    { // input bpc
+                        for (i = 0; i < pixelCount; i++, p += 4)
                             *p = (stbi_uc)(stbi__get16be(s) >> 8);
                     }
                     else
                     {
-                        for(i = 0; i < pixelCount; i++, p += 4)
+                        for (i = 0; i < pixelCount; i++, p += 4)
                             *p = stbi__get8(s);
                     }
                 }
@@ -6649,14 +6644,14 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
     }
 
     // remove weird white matte from PSD
-    if(channelCount >= 4)
+    if (channelCount >= 4)
     {
-        if(ri->bits_per_channel == 16)
+        if (ri->bits_per_channel == 16)
         {
-            for(i = 0; i < w * h; ++i)
+            for (i = 0; i < w * h; ++i)
             {
-                stbi__uint16* pixel = ( stbi__uint16* )out + 4 * i;
-                if(pixel[3] != 0 && pixel[3] != 65535)
+                stbi__uint16* pixel = (stbi__uint16*)out + 4 * i;
+                if (pixel[3] != 0 && pixel[3] != 65535)
                 {
                     float a = pixel[3] / 65535.0f;
                     float ra = 1.0f / a;
@@ -6669,34 +6664,34 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
         }
         else
         {
-            for(i = 0; i < w * h; ++i)
+            for (i = 0; i < w * h; ++i)
             {
                 unsigned char* pixel = out + 4 * i;
-                if(pixel[3] != 0 && pixel[3] != 255)
+                if (pixel[3] != 0 && pixel[3] != 255)
                 {
                     float a = pixel[3] / 255.0f;
                     float ra = 1.0f / a;
                     float inv_a = 255.0f * (1 - ra);
-                    pixel[0] = ( unsigned char )(pixel[0] * ra + inv_a);
-                    pixel[1] = ( unsigned char )(pixel[1] * ra + inv_a);
-                    pixel[2] = ( unsigned char )(pixel[2] * ra + inv_a);
+                    pixel[0] = (unsigned char)(pixel[0] * ra + inv_a);
+                    pixel[1] = (unsigned char)(pixel[1] * ra + inv_a);
+                    pixel[2] = (unsigned char)(pixel[2] * ra + inv_a);
                 }
             }
         }
     }
 
     // convert to desired output format
-    if(req_comp && req_comp != 4)
+    if (req_comp && req_comp != 4)
     {
-        if(ri->bits_per_channel == 16)
-            out = ( stbi_uc* )stbi__convert_format16(( stbi__uint16* )out, 4, req_comp, w, h);
+        if (ri->bits_per_channel == 16)
+            out = (stbi_uc*)stbi__convert_format16((stbi__uint16*)out, 4, req_comp, w, h);
         else
             out = stbi__convert_format(out, 4, req_comp, w, h);
-        if(out == NULL)
-            return out;    // stbi__convert_format frees input on failure
+        if (out == NULL)
+            return out; // stbi__convert_format frees input on failure
     }
 
-    if(comp)
+    if (comp)
         *comp = 4;
     *y = h;
     *x = w;
@@ -6716,8 +6711,8 @@ static void* stbi__psd_load(stbi__context* s, int* x, int* y, int* comp, int req
 static int stbi__pic_is4(stbi__context* s, const char* str)
 {
     int i;
-    for(i = 0; i < 4; ++i)
-        if(stbi__get8(s) != ( stbi_uc )str[i])
+    for (i = 0; i < 4; ++i)
+        if (stbi__get8(s) != (stbi_uc)str[i])
             return 0;
 
     return 1;
@@ -6727,13 +6722,13 @@ static int stbi__pic_test_core(stbi__context* s)
 {
     int i;
 
-    if(!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
+    if (!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
         return 0;
 
-    for(i = 0; i < 84; ++i)
+    for (i = 0; i < 84; ++i)
         stbi__get8(s);
 
-    if(!stbi__pic_is4(s, "PICT"))
+    if (!stbi__pic_is4(s, "PICT"))
         return 0;
 
     return 1;
@@ -6748,11 +6743,11 @@ static stbi_uc* stbi__readval(stbi__context* s, int channel, stbi_uc* dest)
 {
     int mask = 0x80, i;
 
-    for(i = 0; i < 4; ++i, mask >>= 1)
+    for (i = 0; i < 4; ++i, mask >>= 1)
     {
-        if(channel & mask)
+        if (channel & mask)
         {
-            if(stbi__at_eof(s))
+            if (stbi__at_eof(s))
                 return stbi__errpuc("bad file", "PIC file too short");
             dest[i] = stbi__get8(s);
         }
@@ -6765,8 +6760,8 @@ static void stbi__copyval(int channel, stbi_uc* dest, const stbi_uc* src)
 {
     int mask = 0x80, i;
 
-    for(i = 0; i < 4; ++i, mask >>= 1)
-        if(channel & mask)
+    for (i = 0; i < 4; ++i, mask >>= 1)
+        if (channel & mask)
             dest[i] = src[i];
 }
 
@@ -6781,7 +6776,7 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int
     {
         stbi__pic_packet* packet;
 
-        if(num_packets == sizeof(packets) / sizeof(packets[0]))
+        if (num_packets == sizeof(packets) / sizeof(packets[0]))
             return stbi__errpuc("bad format", "too many packets");
 
         packet = &packets[num_packets++];
@@ -6793,103 +6788,103 @@ static stbi_uc* stbi__pic_load_core(stbi__context* s, int width, int height, int
 
         act_comp |= packet->channel;
 
-        if(stbi__at_eof(s))
+        if (stbi__at_eof(s))
             return stbi__errpuc("bad file", "file too short (reading packets)");
-        if(packet->size != 8)
+        if (packet->size != 8)
             return stbi__errpuc("bad format", "packet isn't 8bpp");
-    } while(chained);
+    } while (chained);
 
-    *comp = (act_comp & 0x10 ? 4 : 3);    // has alpha channel?
+    *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
 
-    for(y = 0; y < height; ++y)
+    for (y = 0; y < height; ++y)
     {
         int packet_idx;
 
-        for(packet_idx = 0; packet_idx < num_packets; ++packet_idx)
+        for (packet_idx = 0; packet_idx < num_packets; ++packet_idx)
         {
             stbi__pic_packet* packet = &packets[packet_idx];
             stbi_uc* dest = result + y * width * 4;
 
-            switch(packet->type)
+            switch (packet->type)
             {
-                default:
-                    return stbi__errpuc("bad format", "packet has bad compression type");
+            default:
+                return stbi__errpuc("bad format", "packet has bad compression type");
 
-                case 0:
-                {    // uncompressed
-                    int x;
+            case 0:
+            { // uncompressed
+                int x;
 
-                    for(x = 0; x < width; ++x, dest += 4)
-                        if(!stbi__readval(s, packet->channel, dest))
-                            return 0;
-                    break;
-                }
+                for (x = 0; x < width; ++x, dest += 4)
+                    if (!stbi__readval(s, packet->channel, dest))
+                        return 0;
+                break;
+            }
 
-                case 1:    // Pure RLE
-                {
-                    int left = width, i;
+            case 1: // Pure RLE
+            {
+                int left = width, i;
 
-                    while(left > 0)
-                    {
-                        stbi_uc count, value[4];
+                while (left > 0)
+                {
+                    stbi_uc count, value[4];
 
-                        count = stbi__get8(s);
-                        if(stbi__at_eof(s))
-                            return stbi__errpuc("bad file", "file too short (pure read count)");
+                    count = stbi__get8(s);
+                    if (stbi__at_eof(s))
+                        return stbi__errpuc("bad file", "file too short (pure read count)");
 
-                        if(count > left)
-                            count = ( stbi_uc )left;
+                    if (count > left)
+                        count = (stbi_uc)left;
 
-                        if(!stbi__readval(s, packet->channel, value))
-                            return 0;
+                    if (!stbi__readval(s, packet->channel, value))
+                        return 0;
 
-                        for(i = 0; i < count; ++i, dest += 4)
-                            stbi__copyval(packet->channel, dest, value);
-                        left -= count;
-                    }
+                    for (i = 0; i < count; ++i, dest += 4)
+                        stbi__copyval(packet->channel, dest, value);
+                    left -= count;
                 }
-                break;
+            }
+            break;
 
-                case 2:
-                {    // Mixed RLE
-                    int left = width;
-                    while(left > 0)
-                    {
-                        int count = stbi__get8(s), i;
-                        if(stbi__at_eof(s))
-                            return stbi__errpuc("bad file", "file too short (mixed read count)");
+            case 2:
+            { // Mixed RLE
+                int left = width;
+                while (left > 0)
+                {
+                    int count = stbi__get8(s), i;
+                    if (stbi__at_eof(s))
+                        return stbi__errpuc("bad file", "file too short (mixed read count)");
 
-                        if(count >= 128)
-                        {    // Repeated
-                            stbi_uc value[4];
+                    if (count >= 128)
+                    { // Repeated
+                        stbi_uc value[4];
 
-                            if(count == 128)
-                                count = stbi__get16be(s);
-                            else
-                                count -= 127;
-                            if(count > left)
-                                return stbi__errpuc("bad file", "scanline overrun");
+                        if (count == 128)
+                            count = stbi__get16be(s);
+                        else
+                            count -= 127;
+                        if (count > left)
+                            return stbi__errpuc("bad file", "scanline overrun");
 
-                            if(!stbi__readval(s, packet->channel, value))
-                                return 0;
+                        if (!stbi__readval(s, packet->channel, value))
+                            return 0;
 
-                            for(i = 0; i < count; ++i, dest += 4)
-                                stbi__copyval(packet->channel, dest, value);
-                        }
-                        else
-                        {    // Raw
-                            ++count;
-                            if(count > left)
-                                return stbi__errpuc("bad file", "scanline overrun");
+                        for (i = 0; i < count; ++i, dest += 4)
+                            stbi__copyval(packet->channel, dest, value);
+                    }
+                    else
+                    { // Raw
+                        ++count;
+                        if (count > left)
+                            return stbi__errpuc("bad file", "scanline overrun");
 
-                            for(i = 0; i < count; ++i, dest += 4)
-                                if(!stbi__readval(s, packet->channel, dest))
-                                    return 0;
-                        }
-                        left -= count;
+                        for (i = 0; i < count; ++i, dest += 4)
+                            if (!stbi__readval(s, packet->channel, dest))
+                                return 0;
                     }
-                    break;
+                    left -= count;
                 }
+                break;
+            }
             }
         }
     }
@@ -6903,35 +6898,35 @@ static void* stbi__pic_load(stbi__context* s, int* px, int* py, int* comp, int r
     int i, x, y, internal_comp;
     STBI_NOTUSED(ri);
 
-    if(!comp)
+    if (!comp)
         comp = &internal_comp;
 
-    for(i = 0; i < 92; ++i)
+    for (i = 0; i < 92; ++i)
         stbi__get8(s);
 
     x = stbi__get16be(s);
     y = stbi__get16be(s);
-    if(stbi__at_eof(s))
+    if (stbi__at_eof(s))
         return stbi__errpuc("bad file", "file too short (pic header)");
-    if(!stbi__mad3sizes_valid(x, y, 4, 0))
+    if (!stbi__mad3sizes_valid(x, y, 4, 0))
         return stbi__errpuc("too large", "PIC image too large to decode");
 
-    stbi__get32be(s);    // skip `ratio'
-    stbi__get16be(s);    // skip `fields'
-    stbi__get16be(s);    // skip `pad'
+    stbi__get32be(s); // skip `ratio'
+    stbi__get16be(s); // skip `fields'
+    stbi__get16be(s); // skip `pad'
 
     // intermediate buffer is RGBA
-    result = ( stbi_uc* )stbi__malloc_mad3(x, y, 4, 0);
+    result = (stbi_uc*)stbi__malloc_mad3(x, y, 4, 0);
     memset(result, 0xff, (size_t)x * y * 4);
 
-    if(!stbi__pic_load_core(s, x, y, comp, result))
+    if (!stbi__pic_load_core(s, x, y, comp, result))
     {
         STBI_FREE(result);
         result = 0;
     }
     *px = x;
     *py = y;
-    if(req_comp == 0)
+    if (req_comp == 0)
         req_comp = *comp;
     result = stbi__convert_format(result, 4, req_comp, x, y);
 
@@ -6960,8 +6955,8 @@ typedef struct
 typedef struct
 {
     int w, h;
-    stbi_uc* out;    // output buffer (always 4 components)
-    stbi_uc* background;    // The current "background" as far as a gif is concerned
+    stbi_uc* out;        // output buffer (always 4 components)
+    stbi_uc* background; // The current "background" as far as a gif is concerned
     stbi_uc* history;
     int flags, bgindex, ratio, transparent, eflags;
     stbi_uc pal[256][4];
@@ -6980,12 +6975,12 @@ typedef struct
 static int stbi__gif_test_raw(stbi__context* s)
 {
     int sz;
-    if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
+    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
         return 0;
     sz = stbi__get8(s);
-    if(sz != '9' && sz != '7')
+    if (sz != '9' && sz != '7')
         return 0;
-    if(stbi__get8(s) != 'a')
+    if (stbi__get8(s) != 'a')
         return 0;
     return 1;
 }
@@ -7000,7 +6995,7 @@ static int stbi__gif_test(stbi__context* s)
 static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], int num_entries, int transp)
 {
     int i;
-    for(i = 0; i < num_entries; ++i)
+    for (i = 0; i < num_entries; ++i)
     {
         pal[i][2] = stbi__get8(s);
         pal[i][1] = stbi__get8(s);
@@ -7012,13 +7007,13 @@ static void stbi__gif_parse_colortable(stbi__context* s, stbi_uc pal[256][4], in
 static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_info)
 {
     stbi_uc version;
-    if(stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
+    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
         return stbi__err("not GIF", "Corrupt GIF");
 
     version = stbi__get8(s);
-    if(version != '7' && version != '9')
+    if (version != '7' && version != '9')
         return stbi__err("not GIF", "Corrupt GIF");
-    if(stbi__get8(s) != 'a')
+    if (stbi__get8(s) != 'a')
         return stbi__err("not GIF", "Corrupt GIF");
 
     stbi__g_failure_reason = "";
@@ -7029,13 +7024,13 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in
     g->ratio = stbi__get8(s);
     g->transparent = -1;
 
-    if(comp != 0)
-        *comp = 4;    // can't actually tell whether it's 3 or 4 until we parse the comments
+    if (comp != 0)
+        *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments
 
-    if(is_info)
+    if (is_info)
         return 1;
 
-    if(g->flags & 0x80)
+    if (g->flags & 0x80)
         stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1);
 
     return 1;
@@ -7043,16 +7038,16 @@ static int stbi__gif_header(stbi__context* s, stbi__gif* g, int* comp, int is_in
 
 static int stbi__gif_info_raw(stbi__context* s, int* x, int* y, int* comp)
 {
-    stbi__gif* g = ( stbi__gif* )stbi__malloc(sizeof(stbi__gif));
-    if(!stbi__gif_header(s, g, comp, 1))
+    stbi__gif* g = (stbi__gif*)stbi__malloc(sizeof(stbi__gif));
+    if (!stbi__gif_header(s, g, comp, 1))
     {
         STBI_FREE(g);
         stbi__rewind(s);
         return 0;
     }
-    if(x)
+    if (x)
         *x = g->w;
-    if(y)
+    if (y)
         *y = g->h;
     STBI_FREE(g);
     return 1;
@@ -7065,10 +7060,10 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code)
 
     // recurse to decode the prefixes, since the linked-list is backwards,
     // and working backwards through an interleaved image would be nasty
-    if(g->codes[code].prefix >= 0)
+    if (g->codes[code].prefix >= 0)
         stbi__out_gif_code(g, g->codes[code].prefix);
 
-    if(g->cur_y >= g->max_y)
+    if (g->cur_y >= g->max_y)
         return;
 
     idx = g->cur_x + g->cur_y;
@@ -7076,8 +7071,8 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code)
     g->history[idx / 4] = 1;
 
     c = &g->color_table[g->codes[code].suffix * 4];
-    if(c[3] > 128)
-    {    // don't render transparent pixels;
+    if (c[3] > 128)
+    { // don't render transparent pixels;
         p[0] = c[2];
         p[1] = c[1];
         p[2] = c[0];
@@ -7085,12 +7080,12 @@ static void stbi__out_gif_code(stbi__gif* g, stbi__uint16 code)
     }
     g->cur_x += 4;
 
-    if(g->cur_x >= g->max_x)
+    if (g->cur_x >= g->max_x)
     {
         g->cur_x = g->start_x;
         g->cur_y += g->step;
 
-        while(g->cur_y >= g->max_y && g->parse > 0)
+        while (g->cur_y >= g->max_y && g->parse > 0)
         {
             g->step = (1 << g->parse) * g->line_size;
             g->cur_y = g->start_y + (g->step >> 1);
@@ -7108,7 +7103,7 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
     stbi__gif_lzw* p;
 
     lzw_cs = stbi__get8(s);
-    if(lzw_cs > 12)
+    if (lzw_cs > 12)
         return NULL;
     clear = 1 << lzw_cs;
     first = 1;
@@ -7116,11 +7111,11 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
     codemask = (1 << codesize) - 1;
     bits = 0;
     valid_bits = 0;
-    for(init_code = 0; init_code < clear; init_code++)
+    for (init_code = 0; init_code < clear; init_code++)
     {
         g->codes[init_code].prefix = -1;
-        g->codes[init_code].first = ( stbi_uc )init_code;
-        g->codes[init_code].suffix = ( stbi_uc )init_code;
+        g->codes[init_code].first = (stbi_uc)init_code;
+        g->codes[init_code].suffix = (stbi_uc)init_code;
     }
 
     // support no starting clear code
@@ -7128,18 +7123,18 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
     oldcode = -1;
 
     len = 0;
-    for(;;)
+    for (;;)
     {
-        if(valid_bits < codesize)
+        if (valid_bits < codesize)
         {
-            if(len == 0)
+            if (len == 0)
             {
-                len = stbi__get8(s);    // start new block
-                if(len == 0)
+                len = stbi__get8(s); // start new block
+                if (len == 0)
                     return g->out;
             }
             --len;
-            bits |= ( stbi__int32 )stbi__get8(s) << valid_bits;
+            bits |= (stbi__int32)stbi__get8(s) << valid_bits;
             valid_bits += 8;
         }
         else
@@ -7148,46 +7143,46 @@ static stbi_uc* stbi__process_gif_raster(stbi__context* s, stbi__gif* g)
             bits >>= codesize;
             valid_bits -= codesize;
             // @OPTIMIZE: is there some way we can accelerate the non-clear path?
-            if(code == clear)
-            {    // clear code
+            if (code == clear)
+            { // clear code
                 codesize = lzw_cs + 1;
                 codemask = (1 << codesize) - 1;
                 avail = clear + 2;
                 oldcode = -1;
                 first = 0;
             }
-            else if(code == clear + 1)
-            {    // end of stream code
+            else if (code == clear + 1)
+            { // end of stream code
                 stbi__skip(s, len);
-                while((len = stbi__get8(s)) > 0)
+                while ((len = stbi__get8(s)) > 0)
                     stbi__skip(s, len);
                 return g->out;
             }
-            else if(code <= avail)
+            else if (code <= avail)
             {
-                if(first)
+                if (first)
                 {
                     return stbi__errpuc("no clear code", "Corrupt GIF");
                 }
 
-                if(oldcode >= 0)
+                if (oldcode >= 0)
                 {
                     p = &g->codes[avail++];
-                    if(avail > 8192)
+                    if (avail > 8192)
                     {
                         return stbi__errpuc("too many codes", "Corrupt GIF");
                     }
 
-                    p->prefix = ( stbi__int16 )oldcode;
+                    p->prefix = (stbi__int16)oldcode;
                     p->first = g->codes[oldcode].first;
                     p->suffix = (code == avail) ? p->first : g->codes[code].first;
                 }
-                else if(code == avail)
+                else if (code == avail)
                     return stbi__errpuc("illegal code in raster", "Corrupt GIF");
 
-                stbi__out_gif_code(g, ( stbi__uint16 )code);
+                stbi__out_gif_code(g, (stbi__uint16)code);
 
-                if((avail & codemask) == 0 && avail <= 0x0FFF)
+                if ((avail & codemask) == 0 && avail <= 0x0FFF)
                 {
                     codesize++;
                     codemask = (1 << codesize) - 1;
@@ -7214,22 +7209,22 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i
 
     // on first frame, any non-written pixels get the background colour (non-transparent)
     first_frame = 0;
-    if(g->out == 0)
-    {
-        if(!stbi__gif_header(s, g, comp, 0))
-            return 0;    // stbi__g_failure_reason set by stbi__gif_header
-        g->out = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h);
-        g->background = ( stbi_uc* )stbi__malloc(4 * (size_t)(g->w) * g->h);
-        g->history = ( stbi_uc* )stbi__malloc((size_t)(g->w) * g->h);
-        if(g->out == 0)
+    if (g->out == 0)
+    {
+        if (!stbi__gif_header(s, g, comp, 0))
+            return 0; // stbi__g_failure_reason set by stbi__gif_header
+        g->out = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h);
+        g->background = (stbi_uc*)stbi__malloc(4 * (size_t)(g->w) * g->h);
+        g->history = (stbi_uc*)stbi__malloc((size_t)(g->w) * g->h);
+        if (g->out == 0)
             return stbi__errpuc("outofmem", "Out of memory");
 
         // image is treated as "tranparent" at the start - ie, nothing overwrites the current background;
         // background colour is only used for pixels that are not rendered first frame, after that "background"
         // color refers to teh color that was there the previous frame.
         memset(g->out, 0x00, 4 * (size_t)(g->w) * g->h);
-        memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h);    // state of the background (starts transparent)
-        memset(g->history, 0x00, (size_t)(g->w) * g->h);    // pixels that were affected previous frame
+        memset(g->background, 0x00, 4 * (size_t)(g->w) * g->h); // state of the background (starts transparent)
+        memset(g->history, 0x00, (size_t)(g->w) * g->h);        // pixels that were affected previous frame
         first_frame = 1;
     }
     else
@@ -7238,27 +7233,27 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i
         dispose = (g->eflags & 0x1C) >> 2;
         pcount = g->w * g->h;
 
-        if((dispose == 3) && (two_back == 0))
+        if ((dispose == 3) && (two_back == 0))
         {
-            dispose = 2;    // if I don't have an image to revert back to, default to the old background
+            dispose = 2; // if I don't have an image to revert back to, default to the old background
         }
 
-        if(dispose == 3)
-        {    // use previous graphic
-            for(pi = 0; pi < pcount; ++pi)
+        if (dispose == 3)
+        { // use previous graphic
+            for (pi = 0; pi < pcount; ++pi)
             {
-                if(g->history[pi])
+                if (g->history[pi])
                 {
                     memcpy(&g->out[pi * 4], &two_back[pi * 4], 4);
                 }
             }
         }
-        else if(dispose == 2)
+        else if (dispose == 2)
         {
             // restore what was changed last frame to background before that frame;
-            for(pi = 0; pi < pcount; ++pi)
+            for (pi = 0; pi < pcount; ++pi)
             {
-                if(g->history[pi])
+                if (g->history[pi])
                 {
                     memcpy(&g->out[pi * 4], &g->background[pi * 4], 4);
                 }
@@ -7277,139 +7272,139 @@ static stbi_uc* stbi__gif_load_next(stbi__context* s, stbi__gif* g, int* comp, i
     }
 
     // clear my history;
-    memset(g->history, 0x00, (size_t)(g->w) * g->h);    // pixels that were affected previous frame
+    memset(g->history, 0x00, (size_t)(g->w) * g->h); // pixels that were affected previous frame
 
-    for(;;)
+    for (;;)
     {
         int tag = stbi__get8(s);
-        switch(tag)
+        switch (tag)
+        {
+        case 0x2C: /* Image Descriptor */
         {
-            case 0x2C: /* Image Descriptor */
+            stbi__int32 x, y, w, h;
+            stbi_uc* o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
+                return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x = g->start_x + w * 4;
+            g->max_y = g->start_y + h * g->line_size;
+            g->cur_x = g->start_x;
+            g->cur_y = g->start_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40)
             {
-                stbi__int32 x, y, w, h;
-                stbi_uc* o;
-
-                x = stbi__get16le(s);
-                y = stbi__get16le(s);
-                w = stbi__get16le(s);
-                h = stbi__get16le(s);
-                if(((x + w) > (g->w)) || ((y + h) > (g->h)))
-                    return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
-
-                g->line_size = g->w * 4;
-                g->start_x = x * 4;
-                g->start_y = y * g->line_size;
-                g->max_x = g->start_x + w * 4;
-                g->max_y = g->start_y + h * g->line_size;
-                g->cur_x = g->start_x;
-                g->cur_y = g->start_y;
-
-                g->lflags = stbi__get8(s);
-
-                if(g->lflags & 0x40)
-                {
-                    g->step = 8 * g->line_size;    // first interlaced spacing
-                    g->parse = 3;
-                }
-                else
-                {
-                    g->step = g->line_size;
-                    g->parse = 0;
-                }
+                g->step = 8 * g->line_size; // first interlaced spacing
+                g->parse = 3;
+            }
+            else
+            {
+                g->step = g->line_size;
+                g->parse = 0;
+            }
 
-                if(g->lflags & 0x80)
-                {
-                    stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7),
-                                               g->eflags & 0x01 ? g->transparent : -1);
-                    g->color_table = ( stbi_uc* )g->lpal;
-                }
-                else if(g->flags & 0x80)
-                {
-                    g->color_table = ( stbi_uc* )g->pal;
-                }
-                else
-                    return stbi__errpuc("missing color table", "Corrupt GIF");
+            if (g->lflags & 0x80)
+            {
+                stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7),
+                                           g->eflags & 0x01 ? g->transparent : -1);
+                g->color_table = (stbi_uc*)g->lpal;
+            }
+            else if (g->flags & 0x80)
+            {
+                g->color_table = (stbi_uc*)g->pal;
+            }
+            else
+                return stbi__errpuc("missing color table", "Corrupt GIF");
 
-                o = stbi__process_gif_raster(s, g);
-                if(o == NULL)
-                    return NULL;
+            o = stbi__process_gif_raster(s, g);
+            if (o == NULL)
+                return NULL;
 
-                // if this was the first frame,
-                pcount = g->w * g->h;
-                if(first_frame && (g->bgindex > 0))
+            // if this was the first frame,
+            pcount = g->w * g->h;
+            if (first_frame && (g->bgindex > 0))
+            {
+                // if first frame, any pixel not drawn to gets the background color
+                for (pi = 0; pi < pcount; ++pi)
                 {
-                    // if first frame, any pixel not drawn to gets the background color
-                    for(pi = 0; pi < pcount; ++pi)
+                    if (g->history[pi] == 0)
                     {
-                        if(g->history[pi] == 0)
-                        {
-                            g->pal[g->bgindex][3] = 255;    // just in case it was made transparent, undo that; It will
-                                                            // be reset next frame if need be;
-                            memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
-                        }
+                        g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will
+                                                     // be reset next frame if need be;
+                        memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
                     }
                 }
-
-                return o;
             }
 
-            case 0x21:    // Comment Extension.
-            {
-                int len;
-                int ext = stbi__get8(s);
-                if(ext == 0xF9)
-                {    // Graphic Control Extension.
-                    len = stbi__get8(s);
-                    if(len == 4)
-                    {
-                        g->eflags = stbi__get8(s);
-                        g->delay = 10 * stbi__get16le(s);    // delay - 1/100th of a second, saving as 1/1000ths.
+            return o;
+        }
 
-                        // unset old transparent
-                        if(g->transparent >= 0)
-                        {
-                            g->pal[g->transparent][3] = 255;
-                        }
-                        if(g->eflags & 0x01)
-                        {
-                            g->transparent = stbi__get8(s);
-                            if(g->transparent >= 0)
-                            {
-                                g->pal[g->transparent][3] = 0;
-                            }
-                        }
-                        else
+        case 0x21: // Comment Extension.
+        {
+            int len;
+            int ext = stbi__get8(s);
+            if (ext == 0xF9)
+            { // Graphic Control Extension.
+                len = stbi__get8(s);
+                if (len == 4)
+                {
+                    g->eflags = stbi__get8(s);
+                    g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                    // unset old transparent
+                    if (g->transparent >= 0)
+                    {
+                        g->pal[g->transparent][3] = 255;
+                    }
+                    if (g->eflags & 0x01)
+                    {
+                        g->transparent = stbi__get8(s);
+                        if (g->transparent >= 0)
                         {
-                            // don't need transparent
-                            stbi__skip(s, 1);
-                            g->transparent = -1;
+                            g->pal[g->transparent][3] = 0;
                         }
                     }
                     else
                     {
-                        stbi__skip(s, len);
-                        break;
+                        // don't need transparent
+                        stbi__skip(s, 1);
+                        g->transparent = -1;
                     }
                 }
-                while((len = stbi__get8(s)) != 0)
+                else
                 {
                     stbi__skip(s, len);
+                    break;
                 }
-                break;
             }
+            while ((len = stbi__get8(s)) != 0)
+            {
+                stbi__skip(s, len);
+            }
+            break;
+        }
 
-            case 0x3B:    // gif stream termination code
-                return ( stbi_uc* )s;    // using '1' causes warning on some compilers
+        case 0x3B:              // gif stream termination code
+            return (stbi_uc*)s; // using '1' causes warning on some compilers
 
-            default:
-                return stbi__errpuc("unknown code", "Corrupt GIF");
+        default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
         }
     }
 }
 
 static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y, int* z, int* comp, int req_comp)
 {
-    if(stbi__gif_test(s))
+    if (stbi__gif_test(s))
     {
         int layers = 0;
         stbi_uc* u = 0;
@@ -7418,7 +7413,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y,
         stbi__gif g;
         int stride;
         memset(&g, 0, sizeof(g));
-        if(delays)
+        if (delays)
         {
             *delays = 0;
         }
@@ -7426,44 +7421,44 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y,
         do
         {
             u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
-            if(u == ( stbi_uc* )s)
-                u = 0;    // end of animated gif marker
+            if (u == (stbi_uc*)s)
+                u = 0; // end of animated gif marker
 
-            if(u)
+            if (u)
             {
                 *x = g.w;
                 *y = g.h;
                 ++layers;
                 stride = g.w * g.h * 4;
 
-                if(out)
+                if (out)
                 {
-                    out = ( stbi_uc* )STBI_REALLOC(out, (size_t)layers * stride);
-                    if(delays)
+                    out = (stbi_uc*)STBI_REALLOC(out, (size_t)layers * stride);
+                    if (delays)
                     {
-                        *delays = ( int* )STBI_REALLOC(*delays, sizeof(int) * layers);
+                        *delays = (int*)STBI_REALLOC(*delays, sizeof(int) * layers);
                     }
                 }
                 else
                 {
-                    out = ( stbi_uc* )stbi__malloc((size_t)layers * stride);
-                    if(delays)
+                    out = (stbi_uc*)stbi__malloc((size_t)layers * stride);
+                    if (delays)
                     {
-                        *delays = ( int* )stbi__malloc(layers * sizeof(int));
+                        *delays = (int*)stbi__malloc(layers * sizeof(int));
                     }
                 }
                 memcpy(out + ((layers - 1) * stride), u, stride);
-                if(layers >= 2)
+                if (layers >= 2)
                 {
                     two_back = out - 2 * stride;
                 }
 
-                if(delays)
+                if (delays)
                 {
                     (*delays)[layers - 1U] = g.delay;
                 }
             }
-        } while(u != 0);
+        } while (u != 0);
 
         // free temp buffer;
         STBI_FREE(g.out);
@@ -7471,7 +7466,7 @@ static void* stbi__load_gif_main(stbi__context* s, int** delays, int* x, int* y,
         STBI_FREE(g.background);
 
         // do the final conversion after loading everything;
-        if(req_comp && req_comp != 4)
+        if (req_comp && req_comp != 4)
             out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
 
         *z = layers;
@@ -7490,16 +7485,16 @@ static void* stbi__gif_load(stbi__context* s, int* x, int* y, int* comp, int req
     memset(&g, 0, sizeof(g));
 
     u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
-    if(u == ( stbi_uc* )s)
-        u = 0;    // end of animated gif marker
-    if(u)
+    if (u == (stbi_uc*)s)
+        u = 0; // end of animated gif marker
+    if (u)
     {
         *x = g.w;
         *y = g.h;
 
         // moved conversion to after successful load so that the same
         // can be done for multiple frames.
-        if(req_comp && req_comp != 4)
+        if (req_comp && req_comp != 4)
             u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
     }
 
@@ -7523,8 +7518,8 @@ static int stbi__gif_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__hdr_test_core(stbi__context* s, const char* signature)
 {
     int i;
-    for(i = 0; signature[i]; ++i)
-        if(stbi__get8(s) != signature[i])
+    for (i = 0; signature[i]; ++i)
+        if (stbi__get8(s) != signature[i])
             return 0;
     stbi__rewind(s);
     return 1;
@@ -7534,7 +7529,7 @@ static int stbi__hdr_test(stbi__context* s)
 {
     int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
     stbi__rewind(s);
-    if(!r)
+    if (!r)
     {
         r = stbi__hdr_test_core(s, "#?RGBE\n");
         stbi__rewind(s);
@@ -7548,19 +7543,19 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer)
     int len = 0;
     char c = '\0';
 
-    c = ( char )stbi__get8(z);
+    c = (char)stbi__get8(z);
 
-    while(!stbi__at_eof(z) && c != '\n')
+    while (!stbi__at_eof(z) && c != '\n')
     {
         buffer[len++] = c;
-        if(len == STBI__HDR_BUFLEN - 1)
+        if (len == STBI__HDR_BUFLEN - 1)
         {
             // flush to end of line
-            while(!stbi__at_eof(z) && stbi__get8(z) != '\n')
+            while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
                 ;
             break;
         }
-        c = ( char )stbi__get8(z);
+        c = (char)stbi__get8(z);
     }
 
     buffer[len] = 0;
@@ -7569,12 +7564,12 @@ static char* stbi__hdr_gettoken(stbi__context* z, char* buffer)
 
 static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp)
 {
-    if(input[3] != 0)
+    if (input[3] != 0)
     {
         float f1;
         // Exponent
-        f1 = ( float )ldexp(1.0f, input[3] - ( int )(128 + 8));
-        if(req_comp <= 2)
+        f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8));
+        if (req_comp <= 2)
             output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
         else
         {
@@ -7582,25 +7577,25 @@ static void stbi__hdr_convert(float* output, stbi_uc* input, int req_comp)
             output[1] = input[1] * f1;
             output[2] = input[2] * f1;
         }
-        if(req_comp == 2)
+        if (req_comp == 2)
             output[1] = 1;
-        if(req_comp == 4)
+        if (req_comp == 4)
             output[3] = 1;
     }
     else
     {
-        switch(req_comp)
+        switch (req_comp)
         {
-            case 4:
-                output[3] = 1; /* fallthrough */
-            case 3:
-                output[0] = output[1] = output[2] = 0;
-                break;
-            case 2:
-                output[1] = 1; /* fallthrough */
-            case 1:
-                output[0] = 0;
-                break;
+        case 4:
+            output[3] = 1; /* fallthrough */
+        case 3:
+            output[0] = output[1] = output[2] = 0;
+            break;
+        case 2:
+            output[1] = 1; /* fallthrough */
+        case 1:
+            output[0] = 0;
+            break;
         }
     }
 }
@@ -7621,63 +7616,63 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re
 
     // Check identifier
     headerToken = stbi__hdr_gettoken(s, buffer);
-    if(strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
+    if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
         return stbi__errpf("not HDR", "Corrupt HDR image");
 
     // Parse header
-    for(;;)
+    for (;;)
     {
         token = stbi__hdr_gettoken(s, buffer);
-        if(token[0] == 0)
+        if (token[0] == 0)
             break;
-        if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
+        if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
             valid = 1;
     }
 
-    if(!valid)
+    if (!valid)
         return stbi__errpf("unsupported format", "Unsupported HDR format");
 
     // Parse width and height
     // can't use sscanf() if we're not using stdio!
     token = stbi__hdr_gettoken(s, buffer);
-    if(strncmp(token, "-Y ", 3))
+    if (strncmp(token, "-Y ", 3))
         return stbi__errpf("unsupported data layout", "Unsupported HDR format");
     token += 3;
-    height = ( int )strtol(token, &token, 10);
-    while(*token == ' ')
+    height = (int)strtol(token, &token, 10);
+    while (*token == ' ')
         ++token;
-    if(strncmp(token, "+X ", 3))
+    if (strncmp(token, "+X ", 3))
         return stbi__errpf("unsupported data layout", "Unsupported HDR format");
     token += 3;
-    width = ( int )strtol(token, NULL, 10);
+    width = (int)strtol(token, NULL, 10);
 
     *x = width;
     *y = height;
 
-    if(comp)
+    if (comp)
         *comp = 3;
-    if(req_comp == 0)
+    if (req_comp == 0)
         req_comp = 3;
 
-    if(!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+    if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
         return stbi__errpf("too large", "HDR image is too large");
 
     // Read data
-    hdr_data = ( float* )stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
-    if(!hdr_data)
+    hdr_data = (float*)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+    if (!hdr_data)
         return stbi__errpf("outofmem", "Out of memory");
 
     // Load image data
     // image data is stored as some number of sca
-    if(width < 8 || width >= 32768)
+    if (width < 8 || width >= 32768)
     {
         // Read flat data
-        for(j = 0; j < height; ++j)
+        for (j = 0; j < height; ++j)
         {
-            for(i = 0; i < width; ++i)
+            for (i = 0; i < width; ++i)
             {
                 stbi_uc rgbe[4];
-            main_decode_loop:
+main_decode_loop:
                 stbi__getn(s, rgbe, 4);
                 stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
             }
@@ -7688,83 +7683,83 @@ static float* stbi__hdr_load(stbi__context* s, int* x, int* y, int* comp, int re
         // Read RLE-encoded data
         scanline = NULL;
 
-        for(j = 0; j < height; ++j)
+        for (j = 0; j < height; ++j)
         {
             c1 = stbi__get8(s);
             c2 = stbi__get8(s);
             len = stbi__get8(s);
-            if(c1 != 2 || c2 != 2 || (len & 0x80))
+            if (c1 != 2 || c2 != 2 || (len & 0x80))
             {
                 // not run-length encoded, so we have to actually use THIS data as a decoded
                 // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
                 stbi_uc rgbe[4];
-                rgbe[0] = ( stbi_uc )c1;
-                rgbe[1] = ( stbi_uc )c2;
-                rgbe[2] = ( stbi_uc )len;
-                rgbe[3] = ( stbi_uc )stbi__get8(s);
+                rgbe[0] = (stbi_uc)c1;
+                rgbe[1] = (stbi_uc)c2;
+                rgbe[2] = (stbi_uc)len;
+                rgbe[3] = (stbi_uc)stbi__get8(s);
                 stbi__hdr_convert(hdr_data, rgbe, req_comp);
                 i = 1;
                 j = 0;
                 STBI_FREE(scanline);
-                goto main_decode_loop;    // yes, this makes no sense
+                goto main_decode_loop; // yes, this makes no sense
             }
             len <<= 8;
             len |= stbi__get8(s);
-            if(len != width)
+            if (len != width)
             {
                 STBI_FREE(hdr_data);
                 STBI_FREE(scanline);
                 return stbi__errpf("invalid decoded scanline length", "corrupt HDR");
             }
-            if(scanline == NULL)
+            if (scanline == NULL)
             {
-                scanline = ( stbi_uc* )stbi__malloc_mad2(width, 4, 0);
-                if(!scanline)
+                scanline = (stbi_uc*)stbi__malloc_mad2(width, 4, 0);
+                if (!scanline)
                 {
                     STBI_FREE(hdr_data);
                     return stbi__errpf("outofmem", "Out of memory");
                 }
             }
 
-            for(k = 0; k < 4; ++k)
+            for (k = 0; k < 4; ++k)
             {
                 int nleft;
                 i = 0;
-                while((nleft = width - i) > 0)
+                while ((nleft = width - i) > 0)
                 {
                     count = stbi__get8(s);
-                    if(count > 128)
+                    if (count > 128)
                     {
                         // Run
                         value = stbi__get8(s);
                         count -= 128;
-                        if(count > nleft)
+                        if (count > nleft)
                         {
                             STBI_FREE(hdr_data);
                             STBI_FREE(scanline);
                             return stbi__errpf("corrupt", "bad RLE data in HDR");
                         }
-                        for(z = 0; z < count; ++z)
+                        for (z = 0; z < count; ++z)
                             scanline[i++ * 4 + k] = value;
                     }
                     else
                     {
                         // Dump
-                        if(count > nleft)
+                        if (count > nleft)
                         {
                             STBI_FREE(hdr_data);
                             STBI_FREE(scanline);
                             return stbi__errpf("corrupt", "bad RLE data in HDR");
                         }
-                        for(z = 0; z < count; ++z)
+                        for (z = 0; z < count; ++z)
                             scanline[i++ * 4 + k] = stbi__get8(s);
                     }
                 }
             }
-            for(i = 0; i < width; ++i)
+            for (i = 0; i < width; ++i)
                 stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, scanline + i * 4, req_comp);
         }
-        if(scanline)
+        if (scanline)
             STBI_FREE(scanline);
     }
 
@@ -7778,54 +7773,54 @@ static int stbi__hdr_info(stbi__context* s, int* x, int* y, int* comp)
     int valid = 0;
     int dummy;
 
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
 
-    if(stbi__hdr_test(s) == 0)
+    if (stbi__hdr_test(s) == 0)
     {
         stbi__rewind(s);
         return 0;
     }
 
-    for(;;)
+    for (;;)
     {
         token = stbi__hdr_gettoken(s, buffer);
-        if(token[0] == 0)
+        if (token[0] == 0)
             break;
-        if(strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
+        if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
             valid = 1;
     }
 
-    if(!valid)
+    if (!valid)
     {
         stbi__rewind(s);
         return 0;
     }
     token = stbi__hdr_gettoken(s, buffer);
-    if(strncmp(token, "-Y ", 3))
+    if (strncmp(token, "-Y ", 3))
     {
         stbi__rewind(s);
         return 0;
     }
     token += 3;
-    *y = ( int )strtol(token, &token, 10);
-    while(*token == ' ')
+    *y = (int)strtol(token, &token, 10);
+    while (*token == ' ')
         ++token;
-    if(strncmp(token, "+X ", 3))
+    if (strncmp(token, "+X ", 3))
     {
         stbi__rewind(s);
         return 0;
     }
     token += 3;
-    *x = ( int )strtol(token, NULL, 10);
+    *x = (int)strtol(token, NULL, 10);
     *comp = 3;
     return 1;
 }
-#endif    // STBI_NO_HDR
+#endif // STBI_NO_HDR
 
 #ifndef STBI_NO_BMP
 static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp)
@@ -7836,13 +7831,13 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp)
     info.all_a = 255;
     p = stbi__bmp_parse_header(s, &info);
     stbi__rewind(s);
-    if(p == NULL)
+    if (p == NULL)
         return 0;
-    if(x)
+    if (x)
         *x = s->img_x;
-    if(y)
+    if (y)
         *y = s->img_y;
-    if(comp)
+    if (comp)
         *comp = info.ma ? 4 : 3;
     return 1;
 }
@@ -7852,25 +7847,25 @@ static int stbi__bmp_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp)
 {
     int channelCount, dummy, depth;
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
-    if(stbi__get32be(s) != 0x38425053)
+    if (stbi__get32be(s) != 0x38425053)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(stbi__get16be(s) != 1)
+    if (stbi__get16be(s) != 1)
     {
         stbi__rewind(s);
         return 0;
     }
     stbi__skip(s, 6);
     channelCount = stbi__get16be(s);
-    if(channelCount < 0 || channelCount > 16)
+    if (channelCount < 0 || channelCount > 16)
     {
         stbi__rewind(s);
         return 0;
@@ -7878,12 +7873,12 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp)
     *y = stbi__get32be(s);
     *x = stbi__get32be(s);
     depth = stbi__get16be(s);
-    if(depth != 8 && depth != 16)
+    if (depth != 8 && depth != 16)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(stbi__get16be(s) != 3)
+    if (stbi__get16be(s) != 3)
     {
         stbi__rewind(s);
         return 0;
@@ -7895,27 +7890,27 @@ static int stbi__psd_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__psd_is16(stbi__context* s)
 {
     int channelCount, depth;
-    if(stbi__get32be(s) != 0x38425053)
+    if (stbi__get32be(s) != 0x38425053)
     {
         stbi__rewind(s);
         return 0;
     }
-    if(stbi__get16be(s) != 1)
+    if (stbi__get16be(s) != 1)
     {
         stbi__rewind(s);
         return 0;
     }
     stbi__skip(s, 6);
     channelCount = stbi__get16be(s);
-    if(channelCount < 0 || channelCount > 16)
+    if (channelCount < 0 || channelCount > 16)
     {
         stbi__rewind(s);
         return 0;
     }
-    ( void )stbi__get32be(s);
-    ( void )stbi__get32be(s);
+    (void)stbi__get32be(s);
+    (void)stbi__get32be(s);
     depth = stbi__get16be(s);
-    if(depth != 16)
+    if (depth != 16)
     {
         stbi__rewind(s);
         return 0;
@@ -7930,14 +7925,14 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
     int act_comp = 0, num_packets = 0, chained, dummy;
     stbi__pic_packet packets[10];
 
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
 
-    if(!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
+    if (!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
     {
         stbi__rewind(s);
         return 0;
@@ -7947,12 +7942,12 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
 
     *x = stbi__get16be(s);
     *y = stbi__get16be(s);
-    if(stbi__at_eof(s))
+    if (stbi__at_eof(s))
     {
         stbi__rewind(s);
         return 0;
     }
-    if((*x) != 0 && (1 << 28) / (*x) < (*y))
+    if ((*x) != 0 && (1 << 28) / (*x) < (*y))
     {
         stbi__rewind(s);
         return 0;
@@ -7964,7 +7959,7 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
     {
         stbi__pic_packet* packet;
 
-        if(num_packets == sizeof(packets) / sizeof(packets[0]))
+        if (num_packets == sizeof(packets) / sizeof(packets[0]))
             return 0;
 
         packet = &packets[num_packets++];
@@ -7974,17 +7969,17 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
         packet->channel = stbi__get8(s);
         act_comp |= packet->channel;
 
-        if(stbi__at_eof(s))
+        if (stbi__at_eof(s))
         {
             stbi__rewind(s);
             return 0;
         }
-        if(packet->size != 8)
+        if (packet->size != 8)
         {
             stbi__rewind(s);
             return 0;
         }
-    } while(chained);
+    } while (chained);
 
     *comp = (act_comp & 0x10 ? 4 : 3);
 
@@ -8009,9 +8004,9 @@ static int stbi__pic_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__pnm_test(stbi__context* s)
 {
     char p, t;
-    p = ( char )stbi__get8(s);
-    t = ( char )stbi__get8(s);
-    if(p != 'P' || (t != '5' && t != '6'))
+    p = (char)stbi__get8(s);
+    t = (char)stbi__get8(s);
+    if (p != 'P' || (t != '5' && t != '6'))
     {
         stbi__rewind(s);
         return 0;
@@ -8024,27 +8019,27 @@ static void* stbi__pnm_load(stbi__context* s, int* x, int* y, int* comp, int req
     stbi_uc* out;
     STBI_NOTUSED(ri);
 
-    if(!stbi__pnm_info(s, ( int* )&s->img_x, ( int* )&s->img_y, ( int* )&s->img_n))
+    if (!stbi__pnm_info(s, (int*)&s->img_x, (int*)&s->img_y, (int*)&s->img_n))
         return 0;
 
     *x = s->img_x;
     *y = s->img_y;
-    if(comp)
+    if (comp)
         *comp = s->img_n;
 
-    if(!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+    if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
         return stbi__errpuc("too large", "PNM too large");
 
-    out = ( stbi_uc* )stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
-    if(!out)
+    out = (stbi_uc*)stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
+    if (!out)
         return stbi__errpuc("outofmem", "Out of memory");
     stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
 
-    if(req_comp && req_comp != s->img_n)
+    if (req_comp && req_comp != s->img_n)
     {
         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
-        if(out == NULL)
-            return out;    // stbi__convert_format frees input on failure
+        if (out == NULL)
+            return out; // stbi__convert_format frees input on failure
     }
     return out;
 }
@@ -8056,16 +8051,16 @@ static int stbi__pnm_isspace(char c)
 
 static void stbi__pnm_skip_whitespace(stbi__context* s, char* c)
 {
-    for(;;)
+    for (;;)
     {
-        while(!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-            *c = ( char )stbi__get8(s);
+        while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+            *c = (char)stbi__get8(s);
 
-        if(stbi__at_eof(s) || *c != '#')
+        if (stbi__at_eof(s) || *c != '#')
             break;
 
-        while(!stbi__at_eof(s) && *c != '\n' && *c != '\r')
-            *c = ( char )stbi__get8(s);
+        while (!stbi__at_eof(s) && *c != '\n' && *c != '\r')
+            *c = (char)stbi__get8(s);
     }
 }
 
@@ -8078,10 +8073,10 @@ static int stbi__pnm_getinteger(stbi__context* s, char* c)
 {
     int value = 0;
 
-    while(!stbi__at_eof(s) && stbi__pnm_isdigit(*c))
+    while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c))
     {
         value = value * 10 + (*c - '0');
-        *c = ( char )stbi__get8(s);
+        *c = (char)stbi__get8(s);
     }
 
     return value;
@@ -8092,38 +8087,38 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp)
     int maxv, dummy;
     char c, p, t;
 
-    if(!x)
+    if (!x)
         x = &dummy;
-    if(!y)
+    if (!y)
         y = &dummy;
-    if(!comp)
+    if (!comp)
         comp = &dummy;
 
     stbi__rewind(s);
 
     // Get identifier
-    p = ( char )stbi__get8(s);
-    t = ( char )stbi__get8(s);
-    if(p != 'P' || (t != '5' && t != '6'))
+    p = (char)stbi__get8(s);
+    t = (char)stbi__get8(s);
+    if (p != 'P' || (t != '5' && t != '6'))
     {
         stbi__rewind(s);
         return 0;
     }
 
-    *comp = (t == '6') ? 3 : 1;    // '5' is 1-component .pgm; '6' is 3-component .ppm
+    *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm
 
-    c = ( char )stbi__get8(s);
+    c = (char)stbi__get8(s);
     stbi__pnm_skip_whitespace(s, &c);
 
-    *x = stbi__pnm_getinteger(s, &c);    // read width
+    *x = stbi__pnm_getinteger(s, &c); // read width
     stbi__pnm_skip_whitespace(s, &c);
 
-    *y = stbi__pnm_getinteger(s, &c);    // read height
+    *y = stbi__pnm_getinteger(s, &c); // read height
     stbi__pnm_skip_whitespace(s, &c);
 
-    maxv = stbi__pnm_getinteger(s, &c);    // read max value
+    maxv = stbi__pnm_getinteger(s, &c); // read max value
 
-    if(maxv > 255)
+    if (maxv > 255)
         return stbi__err("max value > 255", "PPM image not 8-bit");
     else
         return 1;
@@ -8133,48 +8128,48 @@ static int stbi__pnm_info(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp)
 {
 #ifndef STBI_NO_JPEG
-    if(stbi__jpeg_info(s, x, y, comp))
+    if (stbi__jpeg_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PNG
-    if(stbi__png_info(s, x, y, comp))
+    if (stbi__png_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_GIF
-    if(stbi__gif_info(s, x, y, comp))
+    if (stbi__gif_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_BMP
-    if(stbi__bmp_info(s, x, y, comp))
+    if (stbi__bmp_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PSD
-    if(stbi__psd_info(s, x, y, comp))
+    if (stbi__psd_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PIC
-    if(stbi__pic_info(s, x, y, comp))
+    if (stbi__pic_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_PNM
-    if(stbi__pnm_info(s, x, y, comp))
+    if (stbi__pnm_info(s, x, y, comp))
         return 1;
 #endif
 
 #ifndef STBI_NO_HDR
-    if(stbi__hdr_info(s, x, y, comp))
+    if (stbi__hdr_info(s, x, y, comp))
         return 1;
 #endif
 
 // test tga last because it's a crappy test!
 #ifndef STBI_NO_TGA
-    if(stbi__tga_info(s, x, y, comp))
+    if (stbi__tga_info(s, x, y, comp))
         return 1;
 #endif
     return stbi__err("unknown image type", "Image not of any known type, or corrupt");
@@ -8183,12 +8178,12 @@ static int stbi__info_main(stbi__context* s, int* x, int* y, int* comp)
 static int stbi__is_16_main(stbi__context* s)
 {
 #ifndef STBI_NO_PNG
-    if(stbi__png_is16(s))
+    if (stbi__png_is16(s))
         return 1;
 #endif
 
 #ifndef STBI_NO_PSD
-    if(stbi__psd_is16(s))
+    if (stbi__psd_is16(s))
         return 1;
 #endif
 
@@ -8200,7 +8195,7 @@ extern int stbi_info(char const* filename, int* x, int* y, int* comp)
 {
     FILE* f = stbi__fopen(filename, "rb");
     int result;
-    if(!f)
+    if (!f)
         return stbi__err("can't fopen", "Unable to open file");
     result = stbi_info_from_file(f, x, y, comp);
     fclose(f);
@@ -8222,7 +8217,7 @@ extern int stbi_is_16_bit(char const* filename)
 {
     FILE* f = stbi__fopen(filename, "rb");
     int result;
-    if(!f)
+    if (!f)
         return stbi__err("can't fopen", "Unable to open file");
     result = stbi_is_16_bit_from_file(f);
     fclose(f);
@@ -8239,7 +8234,7 @@ extern int stbi_is_16_bit_from_file(FILE* f)
     fseek(f, pos, SEEK_SET);
     return r;
 }
-#endif    // !STBI_NO_STDIO
+#endif // !STBI_NO_STDIO
 
 extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y, int* comp)
 {
@@ -8251,7 +8246,7 @@ extern int stbi_info_from_memory(stbi_uc const* buffer, int len, int* x, int* y,
 extern int stbi_info_from_callbacks(stbi_io_callbacks const* c, void* user, int* x, int* y, int* comp)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user);
     return stbi__info_main(&s, x, y, comp);
 }
 
@@ -8265,11 +8260,11 @@ extern int stbi_is_16_bit_from_memory(stbi_uc const* buffer, int len)
 extern int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const* c, void* user)
 {
     stbi__context s;
-    stbi__start_callbacks(&s, ( stbi_io_callbacks* )c, user);
+    stbi__start_callbacks(&s, (stbi_io_callbacks*)c, user);
     return stbi__is_16_main(&s);
 }
 
-#endif    // STB_IMAGE_IMPLEMENTATION
+#endif // STB_IMAGE_IMPLEMENTATION
 
 /*
    revision history:
diff --git a/tests/common/stb_image_write.h b/tests/common/stb_image_write.h
index 42b7c1796..fe585cf94 100644
--- a/tests/common/stb_image_write.h
+++ b/tests/common/stb_image_write.h
@@ -14,7 +14,7 @@
 #endif
 #endif
 
-#ifndef STB_IMAGE_WRITE_STATIC    // C++ forbids static forward declarations
+#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
 extern int stbi_write_tga_with_rle;
 extern int stbi_write_png_compression_level;
 extern int stbi_write_force_png_filter;
@@ -40,7 +40,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x,
 
 STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
-#endif    // INCLUDE_STB_IMAGE_WRITE_H
+#endif // INCLUDE_STB_IMAGE_WRITE_H
 
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
@@ -56,7 +56,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
 #ifndef STBI_WRITE_NO_STDIO
 #include <stdio.h>
-#endif    // STBI_WRITE_NO_STDIO
+#endif // STBI_WRITE_NO_STDIO
 
 #include <stdarg.h>
 #include <stdlib.h>
@@ -72,9 +72,9 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 #endif
 
 #ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz) malloc(sz)
+#define STBIW_MALLOC(sz)        malloc(sz)
 #define STBIW_REALLOC(p, newsz) realloc(p, newsz)
-#define STBIW_FREE(p) free(p)
+#define STBIW_FREE(p)           free(p)
 #endif
 
 #ifndef STBIW_REALLOC_SIZED
@@ -90,7 +90,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 #define STBIW_ASSERT(x) assert(x)
 #endif
 
-#define STBIW_UCHAR(x) ( unsigned char )(( x )&0xff)
+#define STBIW_UCHAR(x) (unsigned char)((x)&0xff)
 
 #ifdef STB_IMAGE_WRITE_STATIC
 static int stbi__flip_vertically_on_write = 0;
@@ -126,69 +126,69 @@ static void stbi__start_write_callbacks(stbi__write_context* s, stbi_write_func*
 
 static void stbi__stdio_write(void* context, void* data, int size)
 {
-    fwrite(data, 1, size, ( FILE* )context);
+    fwrite(data, 1, size, (FILE*)context);
 }
 
 static int stbi__start_write_file(stbi__write_context* s, const char* filename)
 {
     FILE* f;
 #ifdef STBI_MSC_SECURE_CRT
-    if(fopen_s(&f, filename, "wb"))
+    if (fopen_s(&f, filename, "wb"))
         f = NULL;
 #else
     f = fopen(filename, "wb");
 #endif
-    stbi__start_write_callbacks(s, stbi__stdio_write, ( void* )f);
+    stbi__start_write_callbacks(s, stbi__stdio_write, (void*)f);
     return f != NULL;
 }
 
 static void stbi__end_write_file(stbi__write_context* s)
 {
-    fclose(( FILE* )s->context);
+    fclose((FILE*)s->context);
 }
 
-#endif    // !STBI_WRITE_NO_STDIO
+#endif // !STBI_WRITE_NO_STDIO
 
 typedef unsigned int stbiw_uint32;
 typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1];
 
 static void stbiw__writefv(stbi__write_context* s, const char* fmt, va_list v)
 {
-    while(*fmt)
+    while (*fmt)
     {
-        switch(*fmt++)
+        switch (*fmt++)
         {
-            case ' ':
-                break;
-            case '1':
-            {
-                unsigned char x = STBIW_UCHAR(va_arg(v, int));
-                s->func(s->context, &x, 1);
-                break;
-            }
-            case '2':
-            {
-                int x = va_arg(v, int);
-                unsigned char b[2];
-                b[0] = STBIW_UCHAR(x);
-                b[1] = STBIW_UCHAR(x >> 8);
-                s->func(s->context, b, 2);
-                break;
-            }
-            case '4':
-            {
-                stbiw_uint32 x = va_arg(v, int);
-                unsigned char b[4];
-                b[0] = STBIW_UCHAR(x);
-                b[1] = STBIW_UCHAR(x >> 8);
-                b[2] = STBIW_UCHAR(x >> 16);
-                b[3] = STBIW_UCHAR(x >> 24);
-                s->func(s->context, b, 4);
-                break;
-            }
-            default:
-                STBIW_ASSERT(0);
-                return;
+        case ' ':
+            break;
+        case '1':
+        {
+            unsigned char x = STBIW_UCHAR(va_arg(v, int));
+            s->func(s->context, &x, 1);
+            break;
+        }
+        case '2':
+        {
+            int x = va_arg(v, int);
+            unsigned char b[2];
+            b[0] = STBIW_UCHAR(x);
+            b[1] = STBIW_UCHAR(x >> 8);
+            s->func(s->context, b, 2);
+            break;
+        }
+        case '4':
+        {
+            stbiw_uint32 x = va_arg(v, int);
+            unsigned char b[4];
+            b[0] = STBIW_UCHAR(x);
+            b[1] = STBIW_UCHAR(x >> 8);
+            b[2] = STBIW_UCHAR(x >> 16);
+            b[3] = STBIW_UCHAR(x >> 24);
+            s->func(s->context, b, 4);
+            break;
+        }
+        default:
+            STBIW_ASSERT(0);
+            return;
         }
     }
 }
@@ -219,33 +219,33 @@ static void stbiw__write_pixel(stbi__write_context* s, int rgb_dir, int comp, in
     unsigned char bg[3] = {255, 0, 255}, px[3];
     int k;
 
-    if(write_alpha < 0)
+    if (write_alpha < 0)
         s->func(s->context, &d[comp - 1], 1);
 
-    switch(comp)
+    switch (comp)
     {
-        case 2:    // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
-        case 1:
-            if(expand_mono)
-                stbiw__write3(s, d[0], d[0], d[0]);    // monochrome bmp
-            else
-                s->func(s->context, d, 1);    // monochrome TGA
-            break;
-        case 4:
-            if(!write_alpha)
-            {
-                // composite against pink background
-                for(k = 0; k < 3; ++k)
-                    px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
-                stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
-                break;
-            }
-            /* FALLTHROUGH */
-        case 3:
-            stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
+    case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
+    case 1:
+        if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+        else
+            s->func(s->context, d, 1); // monochrome TGA
+        break;
+    case 4:
+        if (!write_alpha)
+        {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+                px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
             break;
+        }
+        /* FALLTHROUGH */
+    case 3:
+        stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
+        break;
     }
-    if(write_alpha > 0)
+    if (write_alpha > 0)
         s->func(s->context, &d[comp - 1], 1);
 }
 
@@ -255,22 +255,22 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i
     stbiw_uint32 zero = 0;
     int i, j, j_end;
 
-    if(y <= 0)
+    if (y <= 0)
         return;
 
-    if(stbi__flip_vertically_on_write)
+    if (stbi__flip_vertically_on_write)
         vdir *= -1;
 
-    if(vdir < 0)
+    if (vdir < 0)
         j_end = -1, j = y - 1;
     else
         j_end = y, j = 0;
 
-    for(; j != j_end; j += vdir)
+    for (; j != j_end; j += vdir)
     {
-        for(i = 0; i < x; ++i)
+        for (i = 0; i < x; ++i)
         {
-            unsigned char* d = ( unsigned char* )data + (j * x + i) * comp;
+            unsigned char* d = (unsigned char*)data + (j * x + i) * comp;
             stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
         }
         s->func(s->context, &zero, scanline_pad);
@@ -280,7 +280,7 @@ static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, i
 static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono,
                           void* data, int alpha, int pad, const char* fmt, ...)
 {
-    if(y < 0 || x < 0)
+    if (y < 0 || x < 0)
     {
         return 0;
     }
@@ -298,11 +298,11 @@ static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x,
 static int stbi_write_bmp_core(stbi__write_context* s, int x, int y, int comp, const void* data)
 {
     int pad = (-x * 3) & 3;
-    return stbiw__outfile(s, -1, -1, x, y, comp, 1, ( void* )data, 0, pad,
+    return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void*)data, 0, pad,
                           "11 4 22 4"
                           "4 44 22 444444",
-                          'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40,    // file header
-                          40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);    // bitmap header
+                          'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40, // file header
+                          40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);                   // bitmap header
 }
 
 STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x, int y, int comp, const void* data)
@@ -316,7 +316,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x,
 STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const void* data)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
         int r = stbi_write_bmp_core(&s, x, y, comp, data);
         stbi__end_write_file(&s);
@@ -325,20 +325,20 @@ STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const
     else
         return 0;
 }
-#endif    //! STBI_WRITE_NO_STDIO
+#endif //! STBI_WRITE_NO_STDIO
 
 static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, void* data)
 {
     int has_alpha = (comp == 2 || comp == 4);
     int colorbytes = has_alpha ? comp - 1 : comp;
-    int format = colorbytes < 2 ? 3 : 2;    // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
+    int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
 
-    if(y < 0 || x < 0)
+    if (y < 0 || x < 0)
         return 0;
 
-    if(!stbi_write_tga_with_rle)
+    if (!stbi_write_tga_with_rle)
     {
-        return stbiw__outfile(s, -1, -1, x, y, comp, 0, ( void* )data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0,
+        return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void*)data, has_alpha, 0, "111 221 2222 11", 0, 0, format, 0,
                               0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
     }
     else
@@ -349,7 +349,7 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
         stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8,
                       has_alpha * 8);
 
-        if(stbi__flip_vertically_on_write)
+        if (stbi__flip_vertically_on_write)
         {
             j = 0;
             jend = y;
@@ -361,27 +361,27 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
             jend = -1;
             jdir = -1;
         }
-        for(; j != jend; j += jdir)
+        for (; j != jend; j += jdir)
         {
-            unsigned char* row = ( unsigned char* )data + j * x * comp;
+            unsigned char* row = (unsigned char*)data + j * x * comp;
             int len;
 
-            for(i = 0; i < x; i += len)
+            for (i = 0; i < x; i += len)
             {
                 unsigned char* begin = row + i * comp;
                 int diff = 1;
                 len = 1;
 
-                if(i < x - 1)
+                if (i < x - 1)
                 {
                     ++len;
                     diff = memcmp(begin, row + (i + 1) * comp, comp);
-                    if(diff)
+                    if (diff)
                     {
                         const unsigned char* prev = begin;
-                        for(k = i + 2; k < x && len < 128; ++k)
+                        for (k = i + 2; k < x && len < 128; ++k)
                         {
-                            if(memcmp(prev, row + k * comp, comp))
+                            if (memcmp(prev, row + k * comp, comp))
                             {
                                 prev += comp;
                                 ++len;
@@ -395,9 +395,9 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
                     }
                     else
                     {
-                        for(k = i + 2; k < x && len < 128; ++k)
+                        for (k = i + 2; k < x && len < 128; ++k)
                         {
-                            if(!memcmp(begin, row + k * comp, comp))
+                            if (!memcmp(begin, row + k * comp, comp))
                             {
                                 ++len;
                             }
@@ -409,11 +409,11 @@ static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, v
                     }
                 }
 
-                if(diff)
+                if (diff)
                 {
                     unsigned char header = STBIW_UCHAR(len - 1);
                     s->func(s->context, &header, 1);
-                    for(k = 0; k < len; ++k)
+                    for (k = 0; k < len; ++k)
                     {
                         stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
                     }
@@ -434,16 +434,16 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func* func, void* context, int x,
 {
     stbi__write_context s;
     stbi__start_write_callbacks(&s, func, context);
-    return stbi_write_tga_core(&s, x, y, comp, ( void* )data);
+    return stbi_write_tga_core(&s, x, y, comp, (void*)data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_tga(char const* filename, int x, int y, int comp, const void* data)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
-        int r = stbi_write_tga_core(&s, x, y, comp, ( void* )data);
+        int r = stbi_write_tga_core(&s, x, y, comp, (void*)data);
         stbi__end_write_file(&s);
         return r;
     }
@@ -463,18 +463,18 @@ void stbiw__linear_to_rgbe(unsigned char* rgbe, float* linear)
     int exponent;
     float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 
-    if(maxcomp < 1e-32f)
+    if (maxcomp < 1e-32f)
     {
         rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
     }
     else
     {
-        float normalize = ( float )frexp(maxcomp, &exponent) * 256.0f / maxcomp;
+        float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp;
 
-        rgbe[0] = ( unsigned char )(linear[0] * normalize);
-        rgbe[1] = ( unsigned char )(linear[1] * normalize);
-        rgbe[2] = ( unsigned char )(linear[2] * normalize);
-        rgbe[3] = ( unsigned char )(exponent + 128);
+        rgbe[0] = (unsigned char)(linear[0] * normalize);
+        rgbe[1] = (unsigned char)(linear[1] * normalize);
+        rgbe[2] = (unsigned char)(linear[2] * normalize);
+        rgbe[3] = (unsigned char)(exponent + 128);
     }
 }
 
@@ -489,7 +489,7 @@ void stbiw__write_run_data(stbi__write_context* s, int length, unsigned char dat
 void stbiw__write_dump_data(stbi__write_context* s, int length, unsigned char* data)
 {
     unsigned char lengthbyte = STBIW_UCHAR(length);
-    STBIW_ASSERT(length <= 128);    // inconsistent with spec but consistent with official code
+    STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
     s->func(s->context, &lengthbyte, 1);
     s->func(s->context, data, length);
 }
@@ -505,21 +505,21 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
     scanlineheader[3] = (width & 0x00ff);
 
     /* skip RLE for images too small or large */
-    if(width < 8 || width >= 32768)
+    if (width < 8 || width >= 32768)
     {
-        for(x = 0; x < width; x++)
+        for (x = 0; x < width; x++)
         {
-            switch(ncomp)
+            switch (ncomp)
             {
-                case 4: /* fallthrough */
-                case 3:
-                    linear[2] = scanline[x * ncomp + 2];
-                    linear[1] = scanline[x * ncomp + 1];
-                    linear[0] = scanline[x * ncomp + 0];
-                    break;
-                default:
-                    linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
-                    break;
+            case 4: /* fallthrough */
+            case 3:
+                linear[2] = scanline[x * ncomp + 2];
+                linear[1] = scanline[x * ncomp + 1];
+                linear[0] = scanline[x * ncomp + 0];
+                break;
+            default:
+                linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
+                break;
             }
             stbiw__linear_to_rgbe(rgbe, linear);
             s->func(s->context, rgbe, 4);
@@ -529,19 +529,19 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
     {
         int c, r;
         /* encode into scratch buffer */
-        for(x = 0; x < width; x++)
+        for (x = 0; x < width; x++)
         {
-            switch(ncomp)
+            switch (ncomp)
             {
-                case 4: /* fallthrough */
-                case 3:
-                    linear[2] = scanline[x * ncomp + 2];
-                    linear[1] = scanline[x * ncomp + 1];
-                    linear[0] = scanline[x * ncomp + 0];
-                    break;
-                default:
-                    linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
-                    break;
+            case 4: /* fallthrough */
+            case 3:
+                linear[2] = scanline[x * ncomp + 2];
+                linear[1] = scanline[x * ncomp + 1];
+                linear[0] = scanline[x * ncomp + 0];
+                break;
+            default:
+                linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
+                break;
             }
             stbiw__linear_to_rgbe(rgbe, linear);
             scratch[x + width * 0] = rgbe[0];
@@ -553,43 +553,43 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
         s->func(s->context, scanlineheader, 4);
 
         /* RLE each component separately */
-        for(c = 0; c < 4; c++)
+        for (c = 0; c < 4; c++)
         {
             unsigned char* comp = &scratch[width * c];
 
             x = 0;
-            while(x < width)
+            while (x < width)
             {
                 // find first run
                 r = x;
-                while(r + 2 < width)
+                while (r + 2 < width)
                 {
-                    if(comp[r] == comp[r + 1] && comp[r] == comp[r + 2])
+                    if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2])
                         break;
                     ++r;
                 }
-                if(r + 2 >= width)
+                if (r + 2 >= width)
                     r = width;
                 // dump up to first run
-                while(x < r)
+                while (x < r)
                 {
                     int len = r - x;
-                    if(len > 128)
+                    if (len > 128)
                         len = 128;
                     stbiw__write_dump_data(s, len, &comp[x]);
                     x += len;
                 }
                 // if there's a run, output it
-                if(r + 2 < width)
-                {    // same test as what we break out of in search loop, so only true if we break'd
+                if (r + 2 < width)
+                { // same test as what we break out of in search loop, so only true if we break'd
                     // find next byte after run
-                    while(r < width && comp[r] == comp[x])
+                    while (r < width && comp[r] == comp[x])
                         ++r;
                     // output run up to r
-                    while(x < r)
+                    while (x < r)
                     {
                         int len = r - x;
-                        if(len > 127)
+                        if (len > 127)
                             len = 127;
                         stbiw__write_run_data(s, len, comp[x]);
                         x += len;
@@ -602,12 +602,12 @@ void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp, uns
 
 static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, float* data)
 {
-    if(y <= 0 || x <= 0 || data == NULL)
+    if (y <= 0 || x <= 0 || data == NULL)
         return 0;
     else
     {
         // Each component is stored separately. Allocate scratch space for full output scanline.
-        unsigned char* scratch = ( unsigned char* )STBIW_MALLOC(x * 4);
+        unsigned char* scratch = (unsigned char*)STBIW_MALLOC(x * 4);
         int i, len;
         char buffer[128];
         char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
@@ -620,7 +620,7 @@ static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, f
 #endif
         s->func(s->context, buffer, len);
 
-        for(i = 0; i < y; i++)
+        for (i = 0; i < y; i++)
             stbiw__write_hdr_scanline(s, x, comp, scratch,
                                       data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i) * x);
         STBIW_FREE(scratch);
@@ -632,23 +632,23 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func* func, void* context, int x,
 {
     stbi__write_context s;
     stbi__start_write_callbacks(&s, func, context);
-    return stbi_write_hdr_core(&s, x, y, comp, ( float* )data);
+    return stbi_write_hdr_core(&s, x, y, comp, (float*)data);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const float* data)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
-        int r = stbi_write_hdr_core(&s, x, y, comp, ( float* )data);
+        int r = stbi_write_hdr_core(&s, x, y, comp, (float*)data);
         stbi__end_write_file(&s);
         return r;
     }
     else
         return 0;
 }
-#endif    // STBI_WRITE_NO_STDIO
+#endif // STBI_WRITE_NO_STDIO
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -657,30 +657,29 @@ STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const
 
 #ifndef STBIW_ZLIB_COMPRESS
 // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
-#define stbiw__sbraw(a) (( int* )( a )-2)
-#define stbiw__sbm(a) stbiw__sbraw(a)[0]
-#define stbiw__sbn(a) stbiw__sbraw(a)[1]
+#define stbiw__sbraw(a) ((int*)(a)-2)
+#define stbiw__sbm(a)   stbiw__sbraw(a)[0]
+#define stbiw__sbn(a)   stbiw__sbraw(a)[1]
 
-#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a))
+#define stbiw__sbneedgrow(a, n)  ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a))
 #define stbiw__sbmaybegrow(a, n) (stbiw__sbneedgrow(a, (n)) ? stbiw__sbgrow(a, n) : 0)
-#define stbiw__sbgrow(a, n) stbiw__sbgrowf(( void** )&(a), (n), sizeof(*(a)))
+#define stbiw__sbgrow(a, n)      stbiw__sbgrowf((void**)&(a), (n), sizeof(*(a)))
 
 #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v))
-#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
-#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0)
+#define stbiw__sbcount(a)   ((a) ? stbiw__sbn(a) : 0)
+#define stbiw__sbfree(a)    ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0)
 
 static void* stbiw__sbgrowf(void** arr, int increment, int itemsize)
 {
     int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1;
-    void* p =
-        STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0,
-                            (unsigned long)itemsize * m + sizeof(int) * 2);
+    void* p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0,
+                                  (unsigned long)itemsize * m + sizeof(int) * 2);
     STBIW_ASSERT(p);
-    if(p)
+    if (p)
     {
-        if(!*arr)
-            (( int* )p)[1] = 0;
-        *arr = ( void* )(( int* )p + 2);
+        if (!*arr)
+            ((int*)p)[1] = 0;
+        *arr = (void*)((int*)p + 2);
         stbiw__sbm(*arr) = m;
     }
     return *arr;
@@ -688,7 +687,7 @@ static void* stbiw__sbgrowf(void** arr, int increment, int itemsize)
 
 static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitbuffer, int* bitcount)
 {
-    while(*bitcount >= 8)
+    while (*bitcount >= 8)
     {
         stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
         *bitbuffer >>= 8;
@@ -700,7 +699,7 @@ static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitb
 static int stbiw__zlib_bitrev(int code, int codebits)
 {
     int res = 0;
-    while(codebits--)
+    while (codebits--)
     {
         res = (res << 1) | (code & 1);
         code >>= 1;
@@ -711,8 +710,8 @@ static int stbiw__zlib_bitrev(int code, int codebits)
 static unsigned int stbiw__zlib_countm(unsigned char* a, unsigned char* b, int limit)
 {
     int i;
-    for(i = 0; i < limit && i < 258; ++i)
-        if(a[i] != b[i])
+    for (i = 0; i < limit && i < 258; ++i)
+        if (a[i] != b[i])
             break;
     return i;
 }
@@ -729,93 +728,94 @@ static unsigned int stbiw__zhash(unsigned char* data)
     return hash;
 }
 
-#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
+#define stbiw__zlib_flush()             (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
 #define stbiw__zlib_add(code, codebits) (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
-#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c)
+#define stbiw__zlib_huffa(b, c)         stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c)
 // default huffman tables
 #define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
-#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + ( n )-144, 9)
-#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + ( n )-256, 7)
-#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + ( n )-280, 8)
-#define stbiw__zlib_huff(n)              \
-    ((n) <= 143 ? stbiw__zlib_huff1(n) : \
-                  (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
+#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9)
+#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256, 7)
+#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280, 8)
+#define stbiw__zlib_huff(n)                                                \
+    ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) \
+                                     : (n) <= 279   ? stbiw__zlib_huff3(n) \
+                                                    : stbiw__zlib_huff4(n))
 #define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
 
 #define stbiw__ZHASH 16384
 
-#endif    // STBIW_ZLIB_COMPRESS
+#endif // STBIW_ZLIB_COMPRESS
 
 unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_len, int quality)
 {
 #ifdef STBIW_ZLIB_COMPRESS
     // user provided a zlib compress implementation, use that
     return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
-#else    // use builtin
-    static unsigned short lengthc[] = {3,  4,  5,  6,  7,  8,  9,  10, 11,  13,  15,  17,  19,  23,  27,
+#else  // use builtin
+    static unsigned short lengthc[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27,
                                        31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259};
     static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
                                        2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
-    static unsigned short distc[] = {1,    2,    3,    4,    5,    7,     9,     13,    17,   25,   33,
-                                     49,   65,   97,   129,  193,  257,   385,   513,   769,  1025, 1537,
+    static unsigned short distc[] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33,
+                                     49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537,
                                      2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768};
-    static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2,  3,  3,  4,  4,  5,  5,  6,
+    static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
                                      6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
     unsigned int bitbuf = 0;
     int i, j, bitcount = 0;
     unsigned char* out = NULL;
-    unsigned char*** hash_table = ( unsigned char*** )STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
-    if(hash_table == NULL)
+    unsigned char*** hash_table = (unsigned char***)STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
+    if (hash_table == NULL)
         return NULL;
-    if(quality < 5)
+    if (quality < 5)
         quality = 5;
 
-    stbiw__sbpush(out, 0x78);    // DEFLATE 32K window
-    stbiw__sbpush(out, 0x5e);    // FLEVEL = 1
+    stbiw__sbpush(out, 0x78); // DEFLATE 32K window
+    stbiw__sbpush(out, 0x5e); // FLEVEL = 1
     stbiw__zlib_add(1, 1);    // BFINAL = 1
     stbiw__zlib_add(1, 2);    // BTYPE = 1 -- fixed huffman
 
-    for(i = 0; i < stbiw__ZHASH; ++i)
+    for (i = 0; i < stbiw__ZHASH; ++i)
         hash_table[i] = NULL;
 
     i = 0;
-    while(i < data_len - 3)
+    while (i < data_len - 3)
     {
         // hash next 3 bytes of data to be compressed
         int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3;
         unsigned char* bestloc = 0;
         unsigned char** hlist = hash_table[h];
         int n = stbiw__sbcount(hlist);
-        for(j = 0; j < n; ++j)
+        for (j = 0; j < n; ++j)
         {
-            if(hlist[j] - data > i - 32768)
-            {    // if entry lies within window
+            if (hlist[j] - data > i - 32768)
+            { // if entry lies within window
                 int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i);
-                if(d >= best)
+                if (d >= best)
                     best = d, bestloc = hlist[j];
             }
         }
         // when hash table entry is too long, delete half the entries
-        if(hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality)
+        if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality)
         {
             STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality, sizeof(hash_table[h][0]) * quality);
             stbiw__sbn(hash_table[h]) = quality;
         }
         stbiw__sbpush(hash_table[h], data + i);
 
-        if(bestloc)
+        if (bestloc)
         {
             // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
             h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1);
             hlist = hash_table[h];
             n = stbiw__sbcount(hlist);
-            for(j = 0; j < n; ++j)
+            for (j = 0; j < n; ++j)
             {
-                if(hlist[j] - data > i - 32767)
+                if (hlist[j] - data > i - 32767)
                 {
                     int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1);
-                    if(e > best)
-                    {    // if next match is better, bail on current match
+                    if (e > best)
+                    { // if next match is better, bail on current match
                         bestloc = NULL;
                         break;
                     }
@@ -823,19 +823,19 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le
             }
         }
 
-        if(bestloc)
+        if (bestloc)
         {
-            int d = ( int )(data + i - bestloc);    // distance back
+            int d = (int)(data + i - bestloc); // distance back
             STBIW_ASSERT(d <= 32767 && best <= 258);
-            for(j = 0; best > lengthc[j + 1] - 1; ++j)
+            for (j = 0; best > lengthc[j + 1] - 1; ++j)
                 ;
             stbiw__zlib_huff(j + 257);
-            if(lengtheb[j])
+            if (lengtheb[j])
                 stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
-            for(j = 0; d > distc[j + 1] - 1; ++j)
+            for (j = 0; d > distc[j + 1] - 1; ++j)
                 ;
             stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5);
-            if(disteb[j])
+            if (disteb[j])
                 stbiw__zlib_add(d - distc[j], disteb[j]);
             i += best;
         }
@@ -846,25 +846,25 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le
         }
     }
     // write out final bytes
-    for(; i < data_len; ++i)
+    for (; i < data_len; ++i)
         stbiw__zlib_huffb(data[i]);
-    stbiw__zlib_huff(256);    // end of block
+    stbiw__zlib_huff(256); // end of block
     // pad with 0 bits to byte boundary
-    while(bitcount)
+    while (bitcount)
         stbiw__zlib_add(0, 1);
 
-    for(i = 0; i < stbiw__ZHASH; ++i)
-        ( void )stbiw__sbfree(hash_table[i]);
+    for (i = 0; i < stbiw__ZHASH; ++i)
+        (void)stbiw__sbfree(hash_table[i]);
     STBIW_FREE(hash_table);
 
     {
         // compute adler32 on input
         unsigned int s1 = 1, s2 = 0;
-        int blocklen = ( int )(data_len % 5552);
+        int blocklen = (int)(data_len % 5552);
         j = 0;
-        while(j < data_len)
+        while (j < data_len)
         {
-            for(i = 0; i < blocklen; ++i)
+            for (i = 0; i < blocklen; ++i)
                 s1 += data[j + i], s2 += s1;
             s1 %= 65521, s2 %= 65521;
             j += blocklen;
@@ -878,8 +878,8 @@ unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_le
     *out_len = stbiw__sbn(out);
     // make returned pointer freeable
     STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
-    return ( unsigned char* )stbiw__sbraw(out);
-#endif    // STBIW_ZLIB_COMPRESS
+    return (unsigned char*)stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
 }
 
 static unsigned int stbiw__crc32(unsigned char* buffer, int len)
@@ -917,14 +917,14 @@ static unsigned int stbiw__crc32(unsigned char* buffer, int len)
 
     unsigned int crc = ~0u;
     int i;
-    for(i = 0; i < len; ++i)
+    for (i = 0; i < len; ++i)
         crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
     return ~crc;
 }
 
 #define stbiw__wpng4(o, a, b, c, d) \
     ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c), (o)[3] = STBIW_UCHAR(d), (o) += 4)
-#define stbiw__wp32(data, v) stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v));
+#define stbiw__wp32(data, v)  stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v));
 #define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3])
 
 static void stbiw__wpcrc(unsigned char** data, int len)
@@ -936,9 +936,9 @@ static void stbiw__wpcrc(unsigned char** data, int len)
 static unsigned char stbiw__paeth(int a, int b, int c)
 {
     int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
-    if(pa <= pb && pa <= pc)
+    if (pa <= pb && pa <= pc)
         return STBIW_UCHAR(a);
-    if(pb <= pc)
+    if (pb <= pc)
         return STBIW_UCHAR(b);
     return STBIW_UCHAR(c);
 }
@@ -954,58 +954,58 @@ static void stbiw__encode_png_line(unsigned char* pixels, int stride_bytes, int
     int type = mymap[filter_type];
     unsigned char* z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y);
     int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
-    for(i = 0; i < n; ++i)
+    for (i = 0; i < n; ++i)
     {
-        switch(type)
+        switch (type)
         {
-            case 0:
-                line_buffer[i] = z[i];
-                break;
-            case 1:
-                line_buffer[i] = z[i];
-                break;
-            case 2:
-                line_buffer[i] = z[i] - z[i - signed_stride];
-                break;
-            case 3:
-                line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
-                break;
-            case 4:
-                line_buffer[i] = ( signed char )(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
-                break;
-            case 5:
-                line_buffer[i] = z[i];
-                break;
-            case 6:
-                line_buffer[i] = z[i];
-                break;
+        case 0:
+            line_buffer[i] = z[i];
+            break;
+        case 1:
+            line_buffer[i] = z[i];
+            break;
+        case 2:
+            line_buffer[i] = z[i] - z[i - signed_stride];
+            break;
+        case 3:
+            line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
+            break;
+        case 4:
+            line_buffer[i] = (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
+            break;
+        case 5:
+            line_buffer[i] = z[i];
+            break;
+        case 6:
+            line_buffer[i] = z[i];
+            break;
         }
     }
-    for(i = n; i < width * n; ++i)
+    for (i = n; i < width * n; ++i)
     {
-        switch(type)
+        switch (type)
         {
-            case 0:
-                line_buffer[i] = z[i];
-                break;
-            case 1:
-                line_buffer[i] = z[i] - z[i - n];
-                break;
-            case 2:
-                line_buffer[i] = z[i] - z[i - signed_stride];
-                break;
-            case 3:
-                line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
-                break;
-            case 4:
-                line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]);
-                break;
-            case 5:
-                line_buffer[i] = z[i] - (z[i - n] >> 1);
-                break;
-            case 6:
-                line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
-                break;
+        case 0:
+            line_buffer[i] = z[i];
+            break;
+        case 1:
+            line_buffer[i] = z[i] - z[i - n];
+            break;
+        case 2:
+            line_buffer[i] = z[i] - z[i - signed_stride];
+            break;
+        case 3:
+            line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
+            break;
+        case 4:
+            line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]);
+            break;
+        case 5:
+            line_buffer[i] = z[i] - (z[i - n] >> 1);
+            break;
+        case 6:
+            line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
+            break;
         }
     }
 }
@@ -1019,76 +1019,76 @@ unsigned char* stbi_write_png_to_mem(unsigned char* pixels, int stride_bytes, in
     signed char* line_buffer;
     int j, zlen;
 
-    if(stride_bytes == 0)
+    if (stride_bytes == 0)
         stride_bytes = x * n;
 
-    if(force_filter >= 5)
+    if (force_filter >= 5)
     {
         force_filter = -1;
     }
 
-    filt = ( unsigned char* )STBIW_MALLOC((x * n + 1) * (size_t)y);
-    if(!filt)
+    filt = (unsigned char*)STBIW_MALLOC((x * n + 1) * (size_t)y);
+    if (!filt)
         return 0;
-    line_buffer = ( signed char* )STBIW_MALLOC((size_t)x * n);
-    if(!line_buffer)
+    line_buffer = (signed char*)STBIW_MALLOC((size_t)x * n);
+    if (!line_buffer)
     {
         STBIW_FREE(filt);
         return 0;
     }
-    for(j = 0; j < y; ++j)
+    for (j = 0; j < y; ++j)
     {
         int filter_type;
-        if(force_filter > -1)
+        if (force_filter > -1)
         {
             filter_type = force_filter;
             stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter, line_buffer);
         }
         else
-        {    // Estimate the best filter by running through all of them:
+        { // Estimate the best filter by running through all of them:
             int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
-            for(filter_type = 0; filter_type < 5; filter_type++)
+            for (filter_type = 0; filter_type < 5; filter_type++)
             {
                 stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type, line_buffer);
 
                 // Estimate the entropy of the line using this filter; the less, the better.
                 est = 0;
-                for(i = 0; i < x * n; ++i)
+                for (i = 0; i < x * n; ++i)
                 {
-                    est += abs(( signed char )line_buffer[i]);
+                    est += abs((signed char)line_buffer[i]);
                 }
-                if(est < best_filter_val)
+                if (est < best_filter_val)
                 {
                     best_filter_val = est;
                     best_filter = filter_type;
                 }
             }
-            if(filter_type != best_filter)
-            {    // If the last iteration already got us the best filter, don't redo it
+            if (filter_type != best_filter)
+            { // If the last iteration already got us the best filter, don't redo it
                 stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter, line_buffer);
                 filter_type = best_filter;
             }
         }
         // when we get here, filter_type contains the filter type, and line_buffer contains the data
-        filt[j * (x * n + 1)] = ( unsigned char )filter_type;
+        filt[j * (x * n + 1)] = (unsigned char)filter_type;
         STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, (size_t)x * n);
     }
     STBIW_FREE(line_buffer);
     zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen, stbi_write_png_compression_level);
     STBIW_FREE(filt);
-    if(!zlib)
+    if (!zlib)
         return 0;
 
     // each tag requires 12 bytes of overhead
-    out = ( unsigned char* )STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
-    if(!out)
+    out = (unsigned char*)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
+    if (!out)
         return 0;
     *out_len = 8 + 12 + 13 + 12 + zlen + 12;
 
     o = out;
     STBIW_MEMMOVE(o, sig, 8);
     o += 8;
-    stbiw__wp32(o, 13);    // header length
+    stbiw__wp32(o, 13); // header length
     stbiw__wptag(o, "IHDR");
     stbiw__wp32(o, x);
     stbiw__wp32(o, y);
@@ -1120,16 +1120,16 @@ STBIWDEF int stbi_write_png(char const* filename, int x, int y, int comp, const
 {
     FILE* f;
     int len;
-    unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len);
-    if(png == NULL)
+    unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len);
+    if (png == NULL)
         return 0;
 #ifdef STBI_MSC_SECURE_CRT
-    if(fopen_s(&f, filename, "wb"))
+    if (fopen_s(&f, filename, "wb"))
         f = NULL;
 #else
     f = fopen(filename, "wb");
 #endif
-    if(!f)
+    if (!f)
     {
         STBIW_FREE(png);
         return 0;
@@ -1145,8 +1145,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x,
                                     int stride_bytes)
 {
     int len;
-    unsigned char* png = stbi_write_png_to_mem(( unsigned char* )data, stride_bytes, x, y, comp, &len);
-    if(png == NULL)
+    unsigned char* png = stbi_write_png_to_mem((unsigned char*)data, stride_bytes, x, y, comp, &len);
+    if (png == NULL)
         return 0;
     func(context, png, len);
     STBIW_FREE(png);
@@ -1161,8 +1161,8 @@ STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x,
  * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
  */
 
-static const unsigned char stbiw__jpg_ZigZag[] = {0,  1,  5,  6,  14, 15, 27, 28, 2,  4,  7,  13, 16, 26, 29, 42,
-                                                  3,  8,  12, 17, 25, 30, 41, 43, 9,  11, 18, 24, 31, 40, 44, 53,
+static const unsigned char stbiw__jpg_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42,
+                                                  3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53,
                                                   10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
                                                   21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};
 
@@ -1171,11 +1171,11 @@ static void stbiw__jpg_writeBits(stbi__write_context* s, int* bitBufP, int* bitC
     int bitBuf = *bitBufP, bitCnt = *bitCntP;
     bitCnt += bs[1];
     bitBuf |= bs[0] << (24 - bitCnt);
-    while(bitCnt >= 8)
+    while (bitCnt >= 8)
     {
         unsigned char c = (bitBuf >> 16) & 255;
         stbiw__putc(s, c);
-        if(c == 255)
+        if (c == 255)
         {
             stbiw__putc(s, 0);
         }
@@ -1202,33 +1202,33 @@ static void stbiw__jpg_DCT(float* d0p, float* d1p, float* d2p, float* d3p, float
     float tmp4 = d3 - d4;
 
     // Even part
-    float tmp10 = tmp0 + tmp3;    // phase 2
+    float tmp10 = tmp0 + tmp3; // phase 2
     float tmp13 = tmp0 - tmp3;
     float tmp11 = tmp1 + tmp2;
     float tmp12 = tmp1 - tmp2;
 
-    d0 = tmp10 + tmp11;    // phase 3
+    d0 = tmp10 + tmp11; // phase 3
     d4 = tmp10 - tmp11;
 
-    z1 = (tmp12 + tmp13) * 0.707106781f;    // c4
-    d2 = tmp13 + z1;    // phase 5
+    z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+    d2 = tmp13 + z1;                     // phase 5
     d6 = tmp13 - z1;
 
     // Odd part
-    tmp10 = tmp4 + tmp5;    // phase 2
+    tmp10 = tmp4 + tmp5; // phase 2
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp6 + tmp7;
 
     // The rotator is modified from fig 4-8 to avoid extra negations.
-    z5 = (tmp10 - tmp12) * 0.382683433f;    // c6
-    z2 = tmp10 * 0.541196100f + z5;    // c2-c6
-    z4 = tmp12 * 1.306562965f + z5;    // c2+c6
-    z3 = tmp11 * 0.707106781f;    // c4
+    z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+    z2 = tmp10 * 0.541196100f + z5;      // c2-c6
+    z4 = tmp12 * 1.306562965f + z5;      // c2+c6
+    z3 = tmp11 * 0.707106781f;           // c4
 
-    z11 = tmp7 + z3;    // phase 5
+    z11 = tmp7 + z3; // phase 5
     z13 = tmp7 - z3;
 
-    *d5p = z13 + z2;    // phase 6
+    *d5p = z13 + z2; // phase 6
     *d3p = z13 - z2;
     *d1p = z11 + z4;
     *d7p = z11 - z4;
@@ -1244,7 +1244,7 @@ static void stbiw__jpg_calcBits(int val, unsigned short bits[2])
     int tmp1 = val < 0 ? -val : val;
     val = val < 0 ? val - 1 : val;
     bits[1] = 1;
-    while(tmp1 >>= 1)
+    while (tmp1 >>= 1)
     {
         ++bits[1];
     }
@@ -1260,29 +1260,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt
     int DU[64];
 
     // DCT rows
-    for(dataOff = 0; dataOff < 64; dataOff += 8)
+    for (dataOff = 0; dataOff < 64; dataOff += 8)
     {
         stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2], &CDU[dataOff + 3], &CDU[dataOff + 4],
                        &CDU[dataOff + 5], &CDU[dataOff + 6], &CDU[dataOff + 7]);
     }
     // DCT columns
-    for(dataOff = 0; dataOff < 8; ++dataOff)
+    for (dataOff = 0; dataOff < 8; ++dataOff)
     {
         stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16], &CDU[dataOff + 24], &CDU[dataOff + 32],
                        &CDU[dataOff + 40], &CDU[dataOff + 48], &CDU[dataOff + 56]);
     }
     // Quantize/descale/zigzag the coefficients
-    for(i = 0; i < 64; ++i)
+    for (i = 0; i < 64; ++i)
     {
         float v = CDU[i] * fdtbl[i];
         // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
-        DU[stbiw__jpg_ZigZag[i]] = ( int )(v < 0 ? v - 0.5f : v + 0.5f);
+        DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
     }
 
     // Encode DC
     diff = DU[0] - DC;
-    if(diff == 0)
+    if (diff == 0)
     {
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
     }
@@ -1295,29 +1295,29 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt
     }
     // Encode ACs
     end0pos = 63;
-    for(; (end0pos > 0) && (DU[end0pos] == 0); --end0pos)
+    for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos)
     {
     }
     // end0pos = first element in reverse order !=0
-    if(end0pos == 0)
+    if (end0pos == 0)
     {
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
         return DU[0];
     }
-    for(i = 1; i <= end0pos; ++i)
+    for (i = 1; i <= end0pos; ++i)
     {
         int startpos = i;
         int nrzeroes;
         unsigned short bits[2];
-        for(; DU[i] == 0 && i <= end0pos; ++i)
+        for (; DU[i] == 0 && i <= end0pos; ++i)
         {
         }
         nrzeroes = i - startpos;
-        if(nrzeroes >= 16)
+        if (nrzeroes >= 16)
         {
             int lng = nrzeroes >> 4;
             int nrmarker;
-            for(nrmarker = 1; nrmarker <= lng; ++nrmarker)
+            for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
                 stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
             nrzeroes &= 15;
         }
@@ -1325,7 +1325,7 @@ static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]);
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
     }
-    if(end0pos != 63)
+    if (end0pos != 63)
     {
         stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
     }
@@ -1362,111 +1362,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in
         0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
         0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
     // Huffman tables
-    static const unsigned short YDC_HT[256][2] = {{0, 2},  {2, 3},  {3, 3},  {4, 3},   {5, 3},   {6, 3},
-                                                  {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}};
-    static const unsigned short UVDC_HT[256][2] = {{0, 2},  {1, 2},   {2, 2},   {6, 3},   {14, 4},    {30, 5},
-                                                   {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}};
+    static const unsigned short YDC_HT[256][2] = {{0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}};
+    static const unsigned short UVDC_HT[256][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}};
     static const unsigned short YAC_HT[256][2] = {
-        {10, 4},     {0, 2},      {1, 2},      {4, 3},      {11, 4},     {26, 5},     {120, 7},    {248, 8},
-        {1014, 10},  {65410, 16}, {65411, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {12, 4},     {27, 5},     {121, 7},    {502, 9},    {2038, 11},  {65412, 16}, {65413, 16},
-        {65414, 16}, {65415, 16}, {65416, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {28, 5},     {249, 8},    {1015, 10},  {4084, 12},  {65417, 16}, {65418, 16}, {65419, 16},
-        {65420, 16}, {65421, 16}, {65422, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {58, 6},     {503, 9},    {4085, 12},  {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16},
-        {65427, 16}, {65428, 16}, {65429, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {59, 6},     {1016, 10},  {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16},
-        {65435, 16}, {65436, 16}, {65437, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {122, 7},    {2039, 11},  {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16},
-        {65443, 16}, {65444, 16}, {65445, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {123, 7},    {4086, 12},  {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16},
-        {65451, 16}, {65452, 16}, {65453, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {250, 8},    {4087, 12},  {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16},
-        {65459, 16}, {65460, 16}, {65461, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {504, 9},    {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16},
-        {65467, 16}, {65468, 16}, {65469, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {505, 9},    {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16},
-        {65476, 16}, {65477, 16}, {65478, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {506, 9},    {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16},
-        {65485, 16}, {65486, 16}, {65487, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {1017, 10},  {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16},
-        {65494, 16}, {65495, 16}, {65496, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {1018, 10},  {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16},
-        {65503, 16}, {65504, 16}, {65505, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {2040, 11},  {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16},
-        {65512, 16}, {65513, 16}, {65514, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16},
-        {65522, 16}, {65523, 16}, {65524, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {2041, 11},  {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16},
-        {65532, 16}, {65533, 16}, {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0}};
+        {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 
16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}};
     static const unsigned short UVAC_HT[256][2] = {
-        {0, 2},      {1, 2},      {4, 3},      {10, 4},     {24, 5},     {25, 5},     {56, 6},     {120, 7},
-        {500, 9},    {1014, 10},  {4084, 12},  {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {11, 4},     {57, 6},     {246, 8},    {501, 9},    {2038, 11},  {4085, 12},  {65416, 16},
-        {65417, 16}, {65418, 16}, {65419, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {26, 5},     {247, 8},    {1015, 10},  {4086, 12},  {32706, 15}, {65420, 16}, {65421, 16},
-        {65422, 16}, {65423, 16}, {65424, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {27, 5},     {248, 8},    {1016, 10},  {4087, 12},  {65425, 16}, {65426, 16}, {65427, 16},
-        {65428, 16}, {65429, 16}, {65430, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {58, 6},     {502, 9},    {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16},
-        {65436, 16}, {65437, 16}, {65438, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {59, 6},     {1017, 10},  {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16},
-        {65444, 16}, {65445, 16}, {65446, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {121, 7},    {2039, 11},  {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16},
-        {65452, 16}, {65453, 16}, {65454, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {122, 7},    {2040, 11},  {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16},
-        {65460, 16}, {65461, 16}, {65462, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {249, 8},    {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16},
-        {65469, 16}, {65470, 16}, {65471, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {503, 9},    {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16},
-        {65478, 16}, {65479, 16}, {65480, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {504, 9},    {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16},
-        {65487, 16}, {65488, 16}, {65489, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {505, 9},    {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16},
-        {65496, 16}, {65497, 16}, {65498, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {506, 9},    {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16},
-        {65505, 16}, {65506, 16}, {65507, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {2041, 11},  {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16},
-        {65514, 16}, {65515, 16}, {65516, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {0, 0},      {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16},
-        {65523, 16}, {65524, 16}, {65525, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-        {1018, 10},  {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16},
-        {65532, 16}, {65533, 16}, {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0}};
-    static const int YQT[] = {16, 11, 10, 16, 24,  40,  51,  61,  12, 12, 14, 19, 26,  58,  60,  55,
-                              14, 13, 16, 24, 40,  57,  69,  56,  14, 17, 22, 29, 51,  87,  80,  62,
-                              18, 22, 37, 56, 68,  109, 103, 77,  24, 35, 55, 64, 81,  104, 113, 92,
+        {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 
16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}};
+    static const int YQT[] = {16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55,
+                              14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62,
+                              18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92,
                               49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99};
     static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 26, 56, 99, 99, 99,
                                99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
                                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99};
-    static const float aasf[] = {1.0f * 2.828427125f,         1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f,
-                                 1.175875602f * 2.828427125f, 1.0f * 2.828427125f,         0.785694958f * 2.828427125f,
+    static const float aasf[] = {1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f,
+                                 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f,
                                  0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f};
 
     int row, col, i, k;
     float fdtbl_Y[64], fdtbl_UV[64];
     unsigned char YTable[64], UVTable[64];
 
-    if(!data || !width || !height || comp > 4 || comp < 1)
+    if (!data || !width || !height || comp > 4 || comp < 1)
     {
         return 0;
     }
 
     quality = quality ? quality : 90;
-    quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+    quality = quality < 1 ? 1 : quality > 100 ? 100
+                                              : quality;
     quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
 
-    for(i = 0; i < 64; ++i)
+    for (i = 0; i < 64; ++i)
     {
         int uvti, yti = (YQT[i] * quality + 50) / 100;
-        YTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(yti < 1 ? 1 : yti > 255 ? 255 : yti);
+        YTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255
+                                                                               : yti);
         uvti = (UVQT[i] * quality + 50) / 100;
-        UVTable[stbiw__jpg_ZigZag[i]] = ( unsigned char )(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+        UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255
+                                                                                  : uvti);
     }
 
-    for(row = 0, k = 0; row < 8; ++row)
+    for (row = 0, k = 0; row < 8; ++row)
     {
-        for(col = 0; col < 8; ++col, ++k)
+        for (col = 0; col < 8; ++col, ++k)
         {
             fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
             fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
@@ -1475,17 +1414,17 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in
 
     // Write Headers
     {
-        static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F',  'I',  'F', 0,    1, 1,
-                                              0,    0,    1,    0,    1, 0,    0,   0xFF, 0xDB, 0,   0x84, 0};
+        static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1,
+                                              0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0};
         static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0};
         const unsigned char head1[] = {0xFF,
                                        0xC0,
                                        0,
                                        0x11,
                                        8,
-                                       ( unsigned char )(height >> 8),
+                                       (unsigned char)(height >> 8),
                                        STBIW_UCHAR(height),
-                                       ( unsigned char )(width >> 8),
+                                       (unsigned char)(width >> 8),
                                        STBIW_UCHAR(width),
                                        3,
                                        1,
@@ -1502,50 +1441,50 @@ static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, in
                                        0x01,
                                        0xA2,
                                        0};
-        s->func(s->context, ( void* )head0, sizeof(head0));
-        s->func(s->context, ( void* )YTable, sizeof(YTable));
+        s->func(s->context, (void*)head0, sizeof(head0));
+        s->func(s->context, (void*)YTable, sizeof(YTable));
         stbiw__putc(s, 1);
         s->func(s->context, UVTable, sizeof(UVTable));
-        s->func(s->context, ( void* )head1, sizeof(head1));
-        s->func(s->context, ( void* )(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_dc_luminance_values, sizeof(std_dc_luminance_values));
-        stbiw__putc(s, 0x10);    // HTYACinfo
-        s->func(s->context, ( void* )(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_ac_luminance_values, sizeof(std_ac_luminance_values));
-        stbiw__putc(s, 1);    // HTUDCinfo
-        s->func(s->context, ( void* )(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
-        stbiw__putc(s, 0x11);    // HTUACinfo
-        s->func(s->context, ( void* )(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1);
-        s->func(s->context, ( void* )std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
-        s->func(s->context, ( void* )head2, sizeof(head2));
+        s->func(s->context, (void*)head1, sizeof(head1));
+        s->func(s->context, (void*)(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1);
+        s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+        stbiw__putc(s, 0x10); // HTYACinfo
+        s->func(s->context, (void*)(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1);
+        s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+        stbiw__putc(s, 1); // HTUDCinfo
+        s->func(s->context, (void*)(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1);
+        s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+        stbiw__putc(s, 0x11); // HTUACinfo
+        s->func(s->context, (void*)(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1);
+        s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+        s->func(s->context, (void*)head2, sizeof(head2));
     }
 
     // Encode 8x8 macroblocks
     {
         static const unsigned short fillBits[] = {0x7F, 7};
-        const unsigned char* imageData = ( const unsigned char* )data;
+        const unsigned char* imageData = (const unsigned char*)data;
         int DCY = 0, DCU = 0, DCV = 0;
         int bitBuf = 0, bitCnt = 0;
         // comp == 2 is grey+alpha (alpha is ignored)
         int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
         int x, y, pos;
-        for(y = 0; y < height; y += 8)
+        for (y = 0; y < height; y += 8)
         {
-            for(x = 0; x < width; x += 8)
+            for (x = 0; x < width; x += 8)
             {
                 float YDU[64], UDU[64], VDU[64];
-                for(row = y, pos = 0; row < y + 8; ++row)
+                for (row = y, pos = 0; row < y + 8; ++row)
                 {
-                    for(col = x; col < x + 8; ++col, ++pos)
+                    for (col = x; col < x + 8; ++col, ++pos)
                     {
                         int p = (stbi__flip_vertically_on_write ? height - 1 - row : row) * width * comp + col * comp;
                         float r, g, b;
-                        if(row >= height)
+                        if (row >= height)
                         {
                             p -= width * comp * (row + 1 - height);
                         }
-                        if(col >= width)
+                        if (col >= width)
                         {
                             p -= comp * (col + 1 - width);
                         }
@@ -1581,14 +1520,14 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x,
 {
     stbi__write_context s;
     stbi__start_write_callbacks(&s, func, context);
-    return stbi_write_jpg_core(&s, x, y, comp, ( void* )data, quality);
+    return stbi_write_jpg_core(&s, x, y, comp, (void*)data, quality);
 }
 
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const void* data, int quality)
 {
     stbi__write_context s;
-    if(stbi__start_write_file(&s, filename))
+    if (stbi__start_write_file(&s, filename))
     {
         int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
         stbi__end_write_file(&s);
@@ -1599,7 +1538,7 @@ STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const
 }
 #endif
 
-#endif    // STB_IMAGE_WRITE_IMPLEMENTATION
+#endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
       1.09  (2018-02-11)
diff --git a/tests/common/tengine_operations.c b/tests/common/tengine_operations.c
index ca621cb07..3f2c885bf 100644
--- a/tests/common/tengine_operations.c
+++ b/tests/common/tengine_operations.c
@@ -71,7 +71,7 @@ image load_image_stb(const char* filename, int channels)
             {
                 int dst_index = i + w * j + w * h * k;
                 int src_index = k + src_c * i + src_c * w * j;
-                im.data[dst_index] = ( float )data[src_index];
+                im.data[dst_index] = (float)data[src_index];
             }
         }
     }
@@ -83,7 +83,7 @@ image load_image_stb(const char* filename, int channels)
 image make_image(int w, int h, int c)
 {
     image out = make_empty_image(w, h, c);
-    out.data = ( float* )calloc((size_t)h * w * c, sizeof(float));
+    out.data = (float*)calloc((size_t)h * w * c, sizeof(float));
     return out;
 }
 
@@ -125,17 +125,17 @@ image imread_process(const char* filename, int img_w, int img_h, float* means, f
 
     switch (choice)
     {
-        case 0:
-            out = gray2bgr(out);
-            break;
-        case 1:
-            out = rgb2gray(out);
-            break;
-        case 2:
-            out = rgb2bgr_permute(out);
-            break;
-        default:
-            break;
+    case 0:
+        out = gray2bgr(out);
+        break;
+    case 1:
+        out = rgb2gray(out);
+        break;
+    case 2:
+        out = rgb2bgr_permute(out);
+        break;
+    default:
+        break;
     }
 
     image resImg = make_image(img_w, img_h, out.c);
@@ -171,8 +171,8 @@ image resize_image(image im, int ow, int oh)
     int h = im.h;
     int w = im.w;
     float shift = 0.f;
-    float _scale_x = ( float )((w - shift) / (ow - shift));
-    float _scale_y = ( float )((h - shift) / (oh - shift));
+    float _scale_x = (float)((w - shift) / (ow - shift));
+    float _scale_y = (float)((h - shift) / (oh - shift));
     float32x4_t scale_x = vdupq_n_f32(_scale_x);
     float offset = 0.5;
     int in_hw = h * w;
@@ -215,8 +215,7 @@ image resize_image(image im, int ow, int oh)
 
                 float32x4_t fx_0 = vsubq_f32(offset_1, fx);
 
-                const int32x4_t in_idx =
-                    vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
+                const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
                 int32x4_t in_index0 = in_idx;
                 int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1));
                 int32x4_t in_index1 = vaddq_s32(in_idx, w_0);
@@ -290,8 +289,8 @@ image resize_image(image im, int ow, int oh)
     int h = im.h;
     int w = im.w;
     float shift = 0.f;
-    float _scale_x = ( float )((w - shift) / (ow - shift));
-    float _scale_y = ( float )((h - shift) / (oh - shift));
+    float _scale_x = (float)((w - shift) / (ow - shift));
+    float _scale_y = (float)((h - shift) / (oh - shift));
 
     float32x4_t scale_x = vdupq_n_f32(_scale_x);
     float offset = 0.5;
@@ -335,8 +334,7 @@ image resize_image(image im, int ow, int oh)
 
                 float32x4_t fx_0 = vsubq_f32(offset_1, fx);
 
-                const int32x4_t in_idx =
-                    vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
+                const int32x4_t in_idx = vaddq_s32(vaddq_s32(vmulq_s32(sy_0, w_0), vcvtq_s32_f32(sx)), vmulq_s32(in_hw_0, k_0));
 
                 int32x4_t in_index0 = in_idx;
                 int32x4_t in_index2 = vaddq_s32(in_idx, vcvtq_s32_f32(offset_1));
@@ -408,8 +406,8 @@ image resize_image(image im, int ow, int oh)
 #endif
 
 #else
-    float scale_x = ( float )(im.w) / (ow);
-    float scale_y = ( float )(im.h) / (oh);
+    float scale_x = (float)(im.w) / (ow);
+    float scale_y = (float)(im.h) / (oh);
     int w = im.w;
     int h = im.h;
     int in_hw = h * w;
@@ -481,13 +479,13 @@ image copyMaker(image im, int top, int bottom, int left, int right, float value)
 void save_image(image im, const char* name)
 {
     char buff[256];
-    unsigned char* data = ( unsigned char* )calloc((size_t)im.w * im.h * im.c, sizeof(char));
+    unsigned char* data = (unsigned char*)calloc((size_t)im.w * im.h * im.c, sizeof(char));
     int i, k;
     for (k = 0; k < im.c; ++k)
     {
         for (i = 0; i < im.w * im.h; ++i)
         {
-            data[i * im.c + k] = ( unsigned char )(im.data[i + k * im.w * im.h]);
+            data[i * im.c + k] = (unsigned char)(im.data[i + k * im.w * im.h]);
         }
     }
 
@@ -505,24 +503,24 @@ void save_image(image im, const char* name)
 
     switch (f)
     {
-        case 0:
-            sprintf(buff, "%s.jpg", name);
-            success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80);
-            break;
-        case 1:
-            sprintf(buff, "%s.png", name);
-            success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c);
-            break;
-        case 2:
-            sprintf(buff, "%s.tga", name);
-            success = stbi_write_tga(buff, im.w, im.h, im.c, data);
-            break;
-        case 3:
-            sprintf(buff, "%s.bmp", name);
-            success = stbi_write_bmp(buff, im.w, im.h, im.c, data);
-            break;
-        default:
-            return;
+    case 0:
+        sprintf(buff, "%s.jpg", name);
+        success = stbi_write_jpg(buff, im.w, im.h, im.c, data, 80);
+        break;
+    case 1:
+        sprintf(buff, "%s.png", name);
+        success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w * im.c);
+        break;
+    case 2:
+        sprintf(buff, "%s.tga", name);
+        success = stbi_write_tga(buff, im.w, im.h, im.c, data);
+        break;
+    case 3:
+        sprintf(buff, "%s.bmp", name);
+        success = stbi_write_bmp(buff, im.w, im.h, im.c, data);
+        break;
+    default:
+        return;
     }
     free(data);
     if (!success)
@@ -588,7 +586,7 @@ static float get_pixelBychannel(image m, int x, int y, int c)
 image copy_image(image p)
 {
     image copy = p;
-    copy.data = ( float* )calloc((size_t)p.h * p.w * p.c, sizeof(float));
+    copy.data = (float*)calloc((size_t)p.h * p.w * p.c, sizeof(float));
     memcpy(copy.data, p.data, (unsigned long)p.h * p.w * p.c * sizeof(float));
     return copy;
 }
@@ -644,7 +642,8 @@ image imread2post(const char* filename)
 {
     image im = load_image_stb(filename, 0);
     const int len = im.c * im.h * im.w;
-    for (int i = 0; i < len; ++i) {
+    for (int i = 0; i < len; ++i)
+    {
         im.data[i] *= 255;
     }
     return im;
@@ -653,20 +652,21 @@ image imread2post(const char* filename)
 image rgb2bgr_permute(image src)
 {
     const int len = src.c * src.h * src.w;
-    float* GRB = ( float* )malloc(sizeof(float) * len);
+    float* GRB = (float*)malloc(sizeof(float) * len);
     for (int c = 0; c < src.c; c++)
     {
         for (int h = 0; h < src.h; h++)
         {
             for (int w = 0; w < src.w; w++)
             {
-                int newIndex = ( c )*src.h * src.w + h * src.w + w;
+                int newIndex = (c)*src.h * src.w + h * src.w + w;
                 int grbIndex = (2 - c) * src.h * src.w + h * src.w + w;
                 GRB[grbIndex] = src.data[newIndex];
             }
         }
     }
-    for (int i = 0; i < len; ++i) {
+    for (int i = 0; i < len; ++i)
+    {
         src.data[i] = GRB[i];
     }
     free(GRB);
@@ -675,14 +675,14 @@ image rgb2bgr_permute(image src)
 
 image image_permute(image src)
 {
-    float* GRB = ( float* )malloc(sizeof(float) * src.c * src.h * src.w);
+    float* GRB = (float*)malloc(sizeof(float) * src.c * src.h * src.w);
     for (int c = 0; c < src.c; c++)
     {
         for (int h = 0; h < src.h; h++)
         {
             for (int w = 0; w < src.w; w++)
             {
-                int newIndex = ( c )*src.h * src.w + h * src.w + w;
+                int newIndex = (c)*src.h * src.w + h * src.w + w;
                 int grbIndex = (2 - c) * src.h * src.w + h * src.w + w;
                 GRB[grbIndex] = src.data[newIndex];
             }
@@ -698,7 +698,7 @@ image gray2bgr(image src)
     res.c = 3;
     res.h = src.h;
     res.w = src.w;
-    res.data = ( float* )malloc(sizeof(float) * 3 * src.h * src.w);
+    res.data = (float*)malloc(sizeof(float) * 3 * src.h * src.w);
     for (int x = 0; x < src.h; x++)
     {
         for (int y = 0; y < src.w; y++)
@@ -716,7 +716,7 @@ image gray2bgr(image src)
 image tranpose(image src)
 {
     int size = src.c * src.h * src.w;
-    float* tempData = ( float* )malloc(sizeof(float) * size);
+    float* tempData = (float*)malloc(sizeof(float) * size);
     int index = 0;
 
     for (int c = 0; c < src.c; c++)
@@ -813,7 +813,7 @@ image rgb2gray(image src)
     res.h = src.h;
     res.w = src.w;
     res.c = 1;
-    res.data = ( float* )malloc(sizeof(float) * res.h * res.w);
+    res.data = (float*)malloc(sizeof(float) * res.h * res.w);
     for (int i = 0; i < res.h; i++)
     {
         for (int j = 0; j < res.w; j++)
@@ -840,7 +840,7 @@ image letterbox(image im, int w, int h)
 {
     int ow = im.w;
     int oh = im.h;
-    if ((( float )w / im.w) < (( float )h / im.h))
+    if (((float)w / im.w) < ((float)h / im.h))
     {
         ow = w;
         oh = (im.h * w) / im.w;
@@ -855,7 +855,7 @@ image letterbox(image im, int w, int h)
     boxed.w = w;
     boxed.h = h;
     boxed.c = im.c;
-    boxed.data = ( float* )malloc(sizeof(float) * im.c * h * w);
+    boxed.data = (float*)malloc(sizeof(float) * im.c * h * w);
 
     for (int i = 0; i < boxed.c * boxed.h * boxed.w; i++)
     {
@@ -870,20 +870,20 @@ image letterbox(image im, int w, int h)
 
 void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, int w)
 {
-    float _scale_x = ( float )(w) / ( float )(ow);
-    float _scale_y = ( float )(h) / ( float )(oh);
+    float _scale_x = (float)(w) / (float)(ow);
+    float _scale_y = (float)(h) / (float)(oh);
     float offset = 0.5f;
 
-    int16_t* buf = ( int16_t* )malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t));
-    int16_t* xCoef = ( int16_t* )(buf);
-    int16_t* xPos = ( int16_t* )(buf + ow + ow);
-    int16_t* yCoef = ( int16_t* )(buf + ow + ow + ow);
-    int16_t* yPos = ( int16_t* )(buf + ow + ow + ow + oh + oh);
+    int16_t* buf = (int16_t*)malloc((ow + ow + ow + oh + oh + oh) * sizeof(int16_t));
+    int16_t* xCoef = (int16_t*)(buf);
+    int16_t* xPos = (int16_t*)(buf + ow + ow);
+    int16_t* yCoef = (int16_t*)(buf + ow + ow + ow);
+    int16_t* yPos = (int16_t*)(buf + ow + ow + ow + oh + oh);
 
     for (int i = 0; i < ow; i++)
     {
-        float fx = ( float )((( float )i + offset) * _scale_x - offset);
-        int sx = ( int )fx;
+        float fx = (float)(((float)i + offset) * _scale_x - offset);
+        int sx = (int)fx;
         fx -= sx;
         if (sx < 0)
         {
@@ -902,8 +902,8 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i
 
     for (int j = 0; j < oh; j++)
     {
-        float fy = ( float )((( float )j + offset) * _scale_y - offset);
-        int sy = ( int )fy;
+        float fy = (float)(((float)j + offset) * _scale_y - offset);
+        int sy = (int)fy;
         fy -= sy;
         if (sy < 0)
         {
@@ -921,7 +921,7 @@ void tengine_resize_f32(float* data, float* res, int ow, int oh, int c, int h, i
     }
 
     //    int32_t* row = new int32_t[ow + ow];
-    int32_t* row = ( int32_t* )malloc((ow + ow) * sizeof(int32_t));
+    int32_t* row = (int32_t*)malloc((ow + ow) * sizeof(int32_t));
 
     for (int k = 0; k < c; k++)
     {
@@ -1021,7 +1021,7 @@ static void sort_cls_score(cls_score* array, int left, int right)
 
 void print_topk(float* data, int total_num, int topk)
 {
-    cls_score* cls_scores = ( cls_score* )malloc(total_num * sizeof(cls_score));
+    cls_score* cls_scores = (cls_score*)malloc(total_num * sizeof(cls_score));
     for (int i = 0; i < total_num; i++)
     {
         cls_scores[i].id = i;
diff --git a/tests/common/util/mathp.c b/tests/common/util/mathp.c
index c1e5933cd..52a78aa18 100644
--- a/tests/common/util/mathp.c
+++ b/tests/common/util/mathp.c
@@ -27,38 +27,32 @@
 
 #include <stdlib.h>
 
-
 int imin(int a, int b)
 {
     return a <= b ? a : b;
 }
 
-
 int imax(int a, int b)
 {
     return a >= b ? a : b;
 }
 
-
 int min_abs(int a, int b)
 {
     return imin(abs(a), abs(b));
 }
 
-
 int max_abs(int a, int b)
 {
     return imax(abs(a), abs(b));
 }
 
-
 static int solve_gcd(int large, int small)
 {
     int val = large % small;
     return 0 == val ? small : gcd(small, val);
 }
 
-
 int gcd(int a, int b)
 {
     if (0 == a || 0 == b)
@@ -67,7 +61,6 @@ int gcd(int a, int b)
     return solve_gcd(max_abs(a, b), min_abs(a, b));
 }
 
-
 int lcm(int a, int b)
 {
     if (0 == a || 0 == b)
@@ -76,14 +69,12 @@ int lcm(int a, int b)
     return abs(a * b) / solve_gcd(max_abs(a, b), min_abs(a, b));
 }
 
-
 int align(int value, int step)
 {
     const int mask = ~(abs(step) - 1);
     return (value + step) & mask;
 }
 
-
 void* align_address(void* address, int step)
 {
     const size_t mask = ~(abs(step) - 1);
diff --git a/tests/common/util/mathp.h b/tests/common/util/mathp.h
index 672ddcdc1..16a7c5d9d 100644
--- a/tests/common/util/mathp.h
+++ b/tests/common/util/mathp.h
@@ -25,7 +25,6 @@
 
 #pragma once
 
-
 /*!
  * @brief  Solve min value
  *
@@ -36,7 +35,6 @@
  */
 int imin(int a, int b);
 
-
 /*!
  * @brief  Solve max value
  *
@@ -47,7 +45,6 @@ int imin(int a, int b);
  */
 int imax(int a, int b);
 
-
 /*!
  * @brief  Solve min absolute value
  *
@@ -58,7 +55,6 @@ int imax(int a, int b);
  */
 int min_abs(int a, int b);
 
-
 /*!
  * @brief  Solve max absolute value
  *
@@ -69,7 +65,6 @@ int min_abs(int a, int b);
  */
 int max_abs(int a, int b);
 
-
 /*!
  * @brief  Solve greatest common divisor
  *
@@ -80,7 +75,6 @@ int max_abs(int a, int b);
  */
 int gcd(int a, int b);
 
-
 /*!
  * @brief  Solve lowest common multiple
  *
@@ -91,7 +85,6 @@ int gcd(int a, int b);
  */
 int lcm(int a, int b);
 
-
 /*!
  * @brief  Solve min aligned value with the step length
  *
@@ -102,7 +95,6 @@ int lcm(int a, int b);
  */
 int align(int value, int step);
 
-
 /*!
  * @brief  Get aligned pointer
  *
diff --git a/tests/common/util/vector.c b/tests/common/util/vector.c
index c4bfd87f6..636009936 100644
--- a/tests/common/util/vector.c
+++ b/tests/common/util/vector.c
@@ -31,25 +31,22 @@
 
 #include <string.h>
 
-
 typedef struct vector_entry
 {
     int valid;
     unsigned char data[];
 } vector_entry_t;
 
-
 static inline vector_entry_t* get_vector_entry(vector_t* v, int idx)
 {
     return (vector_entry_t*)((char*)v->mem + v->entry_size * idx);
 }
 
-
 static inline void free_vector_data_resource(vector_t* v, int idx)
 {
     vector_entry_t* e = get_vector_entry(v, idx);
 
-    if(e->valid && v->free_func)
+    if (e->valid && v->free_func)
     {
         v->free_func(e->data);
     }
@@ -57,7 +54,6 @@ static inline void free_vector_data_resource(vector_t* v, int idx)
     e->valid = 0;
 }
 
-
 static inline void remove_vector_data_not_tail(vector_t* v, int idx)
 {
     vector_entry_t* entry_ptr = NULL;
@@ -78,7 +74,6 @@ static inline void remove_vector_data_not_tail(vector_t* v, int idx)
     entry_ptr->valid = 0;
 }
 
-
 vector_t* create_vector(int elem_size, void (*free_data)(void*))
 {
     vector_t* v = (vector_t*)malloc(sizeof(vector_t));
@@ -109,7 +104,6 @@ vector_t* create_vector(int elem_size, void (*free_data)(void*))
     return v;
 }
 
-
 void release_vector(vector_t* v)
 {
     for (int i = 0; i < v->elem_num; i++)
@@ -121,7 +115,6 @@ void release_vector(vector_t* v)
     free(v);
 }
 
-
 int get_vector_num(vector_t* v)
 {
     if (NULL != v)
@@ -132,7 +125,6 @@ int get_vector_num(vector_t* v)
     return 0;
 }
 
-
 int resize_vector(vector_t* v, int new_size)
 {
     void* new_mem = NULL;
@@ -162,7 +154,7 @@ int resize_vector(vector_t* v, int new_size)
     }
 
     v->real_mem = new_mem;
-    v->mem = ( void* )(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1)));
+    v->mem = (void*)(((size_t)(v->real_mem)) & (~(TE_VECTOR_ALIGN_SIZE - 1)));
 
     for (int i = v->space_num; i < new_size; i++)
     {
@@ -175,10 +167,9 @@ int resize_vector(vector_t* v, int new_size)
     return 0;
 }
 
-
 int push_vector_data(vector_t* v, void* data)
 {
-    if(v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0)
+    if (v->elem_num == v->space_num && resize_vector(v, v->elem_num + v->ahead_num) < 0)
     {
         return -1;
     }
@@ -189,12 +180,11 @@ int push_vector_data(vector_t* v, void* data)
     return 0;
 }
 
-
 int set_vector_data(vector_t* v, int idx, void* data)
 {
     vector_entry_t* e = NULL;
 
-    if(idx >= v->elem_num)
+    if (idx >= v->elem_num)
         return -1;
 
     free_vector_data_resource(v, idx);
@@ -207,10 +197,9 @@ int set_vector_data(vector_t* v, int idx, void* data)
     return 0;
 }
 
-
 void* get_vector_data(vector_t* v, int index)
 {
-    if(index >= v->elem_num)
+    if (index >= v->elem_num)
     {
         return NULL;
     }
@@ -220,7 +209,6 @@ void* get_vector_data(vector_t* v, int index)
     return e->data;
 }
 
-
 int remove_vector_via_pointer(vector_t* v, void* data)
 {
     const int count = v->elem_num;
@@ -245,11 +233,10 @@ int remove_vector_via_pointer(vector_t* v, void* data)
     return 0;
 }
 
-
 void remove_vector_via_index(vector_t* v, int idx)
 {
     // the last one
-    if(idx == v->elem_num - 1)
+    if (idx == v->elem_num - 1)
     {
         free_vector_data_resource(v, idx);
         v->elem_num--;
diff --git a/tests/common/util/vector.h b/tests/common/util/vector.h
index ef7a97906..959985e11 100644
--- a/tests/common/util/vector.h
+++ b/tests/common/util/vector.h
@@ -25,25 +25,23 @@
 
 #pragma once
 
-
 /*!
  * @struct vector_t
  * @brief  C style vector for consecutive storage.
  */
 typedef struct vector
 {
-    int elem_size;                  //!< elements size which will be pushed into vector
-    int elem_num;                   //!< current counter of inserted elements
-
-    int entry_size;                 //!< size of inside vector header entry
-    int space_num;                  //!< the allocated elements counter, which should greater equal to 'elem_num'
-    int ahead_num;                  //!< allocated step when vector is full
-    void* real_mem;                 //!< real aligned memory address which point to vector entry
-    void* mem;                      //!< visual aligned address which point to the very begging of elements
-    void (*free_func)(void*);       //!< elements free function, will be called when release elements or vector
+    int elem_size; //!< elements size which will be pushed into vector
+    int elem_num;  //!< current counter of inserted elements
+
+    int entry_size;           //!< size of inside vector header entry
+    int space_num;            //!< the allocated elements counter, which should greater equal to 'elem_num'
+    int ahead_num;            //!< allocated step when vector is full
+    void* real_mem;           //!< real aligned memory address which point to vector entry
+    void* mem;                //!< visual aligned address which point to the very begging of elements
+    void (*free_func)(void*); //!< elements free function, will be called when release elements or vector
 } vector_t;
 
-
 /*!
  * @brief  Create a vector for a struct(or something else).
  *
@@ -56,7 +54,6 @@ typedef struct vector
  */
 vector_t* create_vector(int elem_size, void (*free_func)(void*));
 
-
 /*!
  * @brief  Release a vector.
  *
@@ -64,7 +61,6 @@ vector_t* create_vector(int elem_size, void (*free_func)(void*));
  */
 void release_vector(vector_t* v);
 
-
 /*!
  * @brief Get the count of elements.
  *
@@ -74,7 +70,6 @@ void release_vector(vector_t* v);
  */
 int get_vector_num(vector_t* v);
 
-
 /*!
  * @brief  Resize a vector.
  *
@@ -85,7 +80,6 @@ int get_vector_num(vector_t* v);
  */
 int resize_vector(vector_t* v, int new_size);
 
-
 /*!
  * @brief Push a element into vector from its pointer.
  *
@@ -96,7 +90,6 @@ int resize_vector(vector_t* v, int new_size);
  */
 int push_vector_data(vector_t* v, void* data);
 
-
 /*!
  * @brief Set a element via its index.
  *
@@ -108,7 +101,6 @@ int push_vector_data(vector_t* v, void* data);
  */
 int set_vector_data(vector_t* v, int index, void* data);
 
-
 /*!
  * @brief Get a element via its index.
  *
@@ -119,7 +111,6 @@ int set_vector_data(vector_t* v, int index, void* data);
  */
 void* get_vector_data(vector_t* v, int index);
 
-
 /*!
  * @brief Remove a element via its pointer.
  *
@@ -130,7 +121,6 @@ void* get_vector_data(vector_t* v, int index);
  */
 int remove_vector_via_pointer(vector_t* v, void* data);
 
-
 /*!
  * @brief Remove a element via its index.
  *
diff --git a/tests/models/test_model_alphapose.cpp b/tests/models/test_model_alphapose.cpp
index 678b33bf3..02a5a84ad 100644
--- a/tests/models/test_model_alphapose.cpp
+++ b/tests/models/test_model_alphapose.cpp
@@ -37,24 +37,24 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 320
-#define DEFAULT_IMG_W 256
-#define DEFAULT_SCALE1 (0.0039216)
-#define DEFAULT_SCALE2 (0.0039215)
-#define DEFAULT_SCALE3 (0.0039215)
-#define DEFAULT_MEAN1 0.406
-#define DEFAULT_MEAN2 0.457
-#define DEFAULT_MEAN3 0.480
+#define DEFAULT_IMG_H        320
+#define DEFAULT_IMG_W        256
+#define DEFAULT_SCALE1       (0.0039216)
+#define DEFAULT_SCALE2       (0.0039215)
+#define DEFAULT_SCALE3       (0.0039215)
+#define DEFAULT_MEAN1        0.406
+#define DEFAULT_MEAN2        0.457
+#define DEFAULT_MEAN3        0.480
 #define DEFAULT_REPEAT_COUNT 1
 #define DEFAULT_THREAD_COUNT 1
 
 const float s_keypoint_thresh = 0.2;
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -69,7 +69,7 @@ void show_usage()
     fprintf(stderr, "[Usage]:  [-h]\n    [-m model_file] [-r repeat_count] [-t thread_count]\n");
 }
 
-bool tengine_predict(float * input_data, graph_t graph, const int input_dims[4], const int & num_thread, const int & loop_count)
+bool tengine_predict(float* input_data, graph_t graph, const int input_dims[4], const int& num_thread, const int& loop_count)
 {
     /* set runtime options */
     struct options opt;
@@ -144,20 +144,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -198,7 +198,7 @@ int main(int argc, char* argv[])
 
     std::string model_name = "alphapose";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data1.data(), sizeof(float), img_size, fp) == 0)
     {
@@ -219,11 +219,11 @@ int main(int argc, char* argv[])
     int heatmap_dims[MAX_SHAPE_DIM_NUM] = {0};
     get_tensor_shape(output_tensor, heatmap_dims, MAX_SHAPE_DIM_NUM);
 
-    float *data = (float *) (get_tensor_buffer(output_tensor));
+    float* data = (float*)(get_tensor_buffer(output_tensor));
     int output_size1 = get_tensor_buffer_size(output_tensor) / (sizeof(float));
     std::string reference_file1 = "./data/" + model_name + "_out.bin";
     std::vector<float> reference_data1(output_size1);
-    FILE *fp1;
+    FILE* fp1;
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
     {
@@ -233,7 +233,6 @@ int main(int argc, char* argv[])
     fclose(fp1);
     int ret1 = float_mismatch(data, reference_data1.data(), output_size1);
 
-
     /* release tengine */
     postrun_graph(graph);
     destroy_graph(graph);
diff --git a/tests/models/test_model_classification.cpp b/tests/models/test_model_classification.cpp
index caa348451..633e8f65b 100644
--- a/tests/models/test_model_classification.cpp
+++ b/tests/models/test_model_classification.cpp
@@ -36,24 +36,24 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 224
-#define DEFAULT_IMG_W 224
-#define DEFAULT_SCALE1 1.f
-#define DEFAULT_SCALE2 1.f
-#define DEFAULT_SCALE3 1.f
-#define DEFAULT_MEAN1 104.007
-#define DEFAULT_MEAN2 116.669
-#define DEFAULT_MEAN3 122.679
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        224
+#define DEFAULT_IMG_W        224
+#define DEFAULT_SCALE1       1.f
+#define DEFAULT_SCALE2       1.f
+#define DEFAULT_SCALE3       1.f
+#define DEFAULT_MEAN1        104.007
+#define DEFAULT_MEAN2        116.669
+#define DEFAULT_MEAN3        122.679
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.001)
+        if (fabs(tmp) > 0.001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -82,7 +82,7 @@ int main(int argc, char* argv[])
     int num_thread = DEFAULT_THREAD_COUNT;
     int cpu_affinity = DEFAULT_CPU_AFFINITY;
     std::string model_name;
-    std::string model_file; 
+    std::string model_file;
     char* image_file = NULL;
     float img_hw[2] = {0.f};
     int img_h = 0;
@@ -95,37 +95,37 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_name = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_h = ( int )img_hw[0];
-                img_w = ( int )img_hw[1];
-                break;
-            case 's':
-                split(scale, optarg, ",");
-                break;
-            case 'w':
-                split(mean, optarg, ",");
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_name = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_h = (int)img_hw[0];
+            img_w = (int)img_hw[1];
+            break;
+        case 's':
+            split(scale, optarg, ",");
+            break;
+        case 'w':
+            split(mean, optarg, ",");
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
+    int dims[] = {1, 3, img_h, img_w}; // nchw
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
@@ -222,7 +222,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -252,7 +252,7 @@ int main(int argc, char* argv[])
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     print_topk(output_data, output_size, 5);
@@ -261,12 +261,12 @@ int main(int argc, char* argv[])
     /* check the result */
     std::string reference_file = "./data/" + model_name + "_out.bin";
     std::vector<float> reference_data(output_size);
-    FILE *fp;
+    FILE* fp;
     fp = fopen(reference_file.c_str(), "rb");
     if (!fp)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file.c_str());
-        return -1;        
+        fprintf(stderr, "read reference %s failed!\n", reference_file.c_str());
+        return -1;
     }
     if (fread(reference_data.data(), sizeof(float), output_size, fp) == 0)
     {
@@ -282,5 +282,5 @@ int main(int argc, char* argv[])
     destroy_graph(graph);
     release_tengine();
 
-    return ret;        
+    return ret;
 }
diff --git a/tests/models/test_model_common.cpp b/tests/models/test_model_common.cpp
index 26a3076f6..1f5b2ae7c 100644
--- a/tests/models/test_model_common.cpp
+++ b/tests/models/test_model_common.cpp
@@ -71,8 +71,8 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w)
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * img_c;
-    int dims[] = {1, img_c, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, img_c, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -91,7 +91,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w)
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -105,7 +105,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w)
     {
         input_data[i] = 1.f;
     }
-    
+
     /* run graph */
     if (run_graph(graph, 1) < 0)
     {
@@ -118,7 +118,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w)
     {
         /* get the result of classification */
         tensor_t output_tensor = get_graph_output_tensor(graph, tensor_id, 0);
-        float* output_data = ( float* )get_tensor_buffer(output_tensor);
+        float* output_data = (float*)get_tensor_buffer(output_tensor);
         int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
         const char* tensor_name = get_tensor_name(output_tensor);
         fprintf(stderr, "test output tensor: %s begin\n", tensor_name);
@@ -133,11 +133,11 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w)
             fprintf(stderr, "open val file %s failed.\n", val_data.c_str());
             return -1;
         }
-            
+
         std::string line_str;
         char* end;
         int onnx_out_size = 1;
-        while(std::getline(f, line_str))
+        while (std::getline(f, line_str))
         {
             // std::cout << line_str << std::endl;
             if (line_str == "shape:")
@@ -157,12 +157,12 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w)
 
         float* onnx_out_data = (float*)malloc(sizeof(float) * onnx_out_size);
         int i = 0;
-        while(std::getline(f, line_str))
+        while (std::getline(f, line_str))
         {
             std::stringstream ss(line_str);
             std::string str;
             int j = 0;
-            while(getline(ss, str, ' '))
+            while (getline(ss, str, ' '))
             {
                 float tmp = strtof32(str.c_str(), &end);
                 onnx_out_data[i++] = tmp;
@@ -182,7 +182,7 @@ int onnx_model_test(std::string model_file, int img_c, int img_h, int img_w)
     }
 
     fprintf(stderr, "test model: %s pass!\n", model_file.c_str());
-    
+
     /* release tengine */
     free(input_data);
     postrun_graph(graph);
@@ -216,20 +216,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'g':
-                split(img_hw, optarg, ",");
-                img_c = ( int )img_hw[0];
-                img_h = ( int )img_hw[1];
-                img_w = ( int )img_hw[2];
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'g':
+            split(img_hw, optarg, ",");
+            img_c = (int)img_hw[0];
+            img_h = (int)img_hw[1];
+            img_w = (int)img_hw[2];
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
diff --git a/tests/models/test_model_crnn.cpp b/tests/models/test_model_crnn.cpp
index e280054e7..9ae20d5fa 100644
--- a/tests/models/test_model_crnn.cpp
+++ b/tests/models/test_model_crnn.cpp
@@ -40,10 +40,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -93,23 +93,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
-    std::string model_name="crnn_lite_dense";
+    std::string model_name = "crnn_lite_dense";
     /* check files */
     if (model_file == nullptr)
     {
@@ -145,7 +145,7 @@ int main(int argc, char* argv[])
 
     int img_size = img_h * img_w * 1;
     int dims[] = {1, 1, img_h, img_w};
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -175,7 +175,7 @@ int main(int argc, char* argv[])
 
     /* prepare process input data, set the data mem to input tensor */
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data, sizeof(float), img_size, fp) == 0)
     {
@@ -208,13 +208,13 @@ int main(int argc, char* argv[])
 
     /* process the crnn result */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* ocr_data = ( float* )get_tensor_buffer(output_tensor);
+    float* ocr_data = (float*)get_tensor_buffer(output_tensor);
 
     /* check the result */
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
     std::string reference_file = "./data/" + model_name + "_out.bin";
     std::vector<float> reference_data(output_size);
-    FILE *fp1;
+    FILE* fp1;
     fp1 = fopen(reference_file.c_str(), "rb");
     if (fread(reference_data.data(), sizeof(float), output_size, fp1) == 0)
     {
@@ -225,7 +225,7 @@ int main(int argc, char* argv[])
 
     int ret = float_mismatch(ocr_data, reference_data.data(), output_size);
 
-//    process_crnn_result(ocr_data, label_file);
+    //    process_crnn_result(ocr_data, label_file);
 
     free(input_data);
     postrun_graph(graph);
diff --git a/tests/models/test_model_efficientdet.c b/tests/models/test_model_efficientdet.c
index 1a0ac9759..5ede1b1d6 100644
--- a/tests/models/test_model_efficientdet.c
+++ b/tests/models/test_model_efficientdet.c
@@ -31,24 +31,24 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 512
-#define DEFAULT_IMG_W 512
-#define DEFAULT_SCALE1 0.017124754f
-#define DEFAULT_SCALE2 0.017507003f
-#define DEFAULT_SCALE3 0.017429194f
-#define DEFAULT_MEAN1 123.675
-#define DEFAULT_MEAN2 116.280
-#define DEFAULT_MEAN3 103.530
-#define DEFAULT_LOOP_COUNT 1
+#define DEFAULT_IMG_H        512
+#define DEFAULT_IMG_W        512
+#define DEFAULT_SCALE1       0.017124754f
+#define DEFAULT_SCALE2       0.017507003f
+#define DEFAULT_SCALE3       0.017429194f
+#define DEFAULT_MEAN1        123.675
+#define DEFAULT_MEAN2        116.280
+#define DEFAULT_MEAN3        103.530
+#define DEFAULT_LOOP_COUNT   1
 #define DEFAULT_THREAD_COUNT 1
 #define DEFAULT_CPU_AFFINITY 255
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.001)
+        if (fabs(tmp) > 0.001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -59,15 +59,18 @@ int float_mismatch(float* current, float* reference, int size)
 }
 
 void repeat(const float* arr, int arr_length, int times, float offset,
-            float* result, int arr_starts_from, int arr_stride) {
+            float* result, int arr_starts_from, int arr_stride)
+{
     int length = arr_length * times;
 
-    if (result == NULL) {
+    if (result == NULL)
+    {
         result = malloc(length * sizeof(float));
         arr_starts_from = 0;
     }
 
-    for (int i = 0, j = 0; i < length; i++, j += arr_stride) {
+    for (int i = 0, j = 0; i < length; i++, j += arr_stride)
+    {
         result[j + arr_starts_from] = arr[i / times] + offset;
     }
 }
@@ -78,9 +81,9 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     int PYRAMID_LEVELS[] = {3, 4, 5, 6, 7};
     int STRIDES[] = {8, 16, 32, 64, 128};
     float SCALES[] = {
-            (float) pow(2, 0.),
-            (float) pow(2, 1. / 3.),
-            (float) pow(2, 2. / 3.),
+        (float)pow(2, 0.),
+        (float)pow(2, 1. / 3.),
+        (float)pow(2, 2. / 3.),
     };
     float RATIOS_X[] = {1.f, 1.4f, 0.7f};
     float RATIOS_Y[] = {1.f, 0.7f, 1.4f};
@@ -117,8 +120,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -132,9 +135,7 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
         fprintf(stderr, "Set input tensor shape failed\n");
         return -1;
     }
-	
 
-	
     if (set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float)) < 0)
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
@@ -152,7 +153,7 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     float means[3] = {mean[0], mean[1], mean[2]};
     float scales[3] = {scale[0], scale[1], scale[2]};
     char* input_file = "./data/efficientdet_in.bin";
-    FILE *fp;
+    FILE* fp;
 
     fp = fopen(input_file, "rb");
     if (fread(input_data, sizeof(float), img_size, fp) == 0)
@@ -191,19 +192,19 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
 
     /* get the result of classification */
     tensor_t output_tensor_regression = get_graph_output_tensor(graph, 0, 0);
-    float* output_data_regression = ( float* )get_tensor_buffer(output_tensor_regression);
+    float* output_data_regression = (float*)get_tensor_buffer(output_tensor_regression);
     int num_anchors = get_tensor_buffer_size(output_tensor_regression) / sizeof(float) / 4;
 
     tensor_t output_tensor_classification = get_graph_output_tensor(graph, 1, 0);
-    float* output_data_classification = ( float* )get_tensor_buffer(output_tensor_classification);
+    float* output_data_classification = (float*)get_tensor_buffer(output_tensor_classification);
     int num_classes = get_tensor_buffer_size(output_tensor_classification) / sizeof(float) / num_anchors;
 
     // postprocess
     char* output_file1 = "./data/efficientdet_out1.bin";
     char* output_file2 = "./data/efficientdet_out2.bin";
-    float*  reference_data1 = (float*)malloc(num_anchors*sizeof(float));
-    float*  reference_data2 = (float*)malloc(num_classes*sizeof(float));
-    FILE *fp1;
+    float* reference_data1 = (float*)malloc(num_anchors * sizeof(float));
+    float* reference_data2 = (float*)malloc(num_classes * sizeof(float));
+    FILE* fp1;
     //read
     fp1 = fopen(output_file1, "rb");
     if (!fp1 || fread(reference_data1, sizeof(float), num_anchors, fp1) == 0)
@@ -221,8 +222,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
     fclose(fp1);
     int ret1 = float_mismatch(output_data_regression, reference_data1, num_anchors);
     int ret2 = float_mismatch(output_data_classification, reference_data2, num_classes);
-	
-	int ret = (ret1 | ret2 );
+
+    int ret = (ret1 | ret2);
 
     /* release tengine */
     free(input_data);
@@ -236,8 +237,8 @@ int tengine_detect(const char* model_file, const char* image_file, int img_h, in
 void show_usage()
 {
     fprintf(
-            stderr,
-            "[Usage]:  [-h]\n    [-m model_file] \n  [-r loop_count] [-t thread_count] [-a cpu_affinity]\n");
+        stderr,
+        "[Usage]:  [-h]\n    [-m model_file] \n  [-r loop_count] [-t thread_count] [-a cpu_affinity]\n");
 }
 
 int main(int argc, char* argv[])
@@ -258,23 +259,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -304,7 +305,7 @@ int main(int argc, char* argv[])
         scale[0] = DEFAULT_SCALE1;
         scale[1] = DEFAULT_SCALE2;
         scale[2] = DEFAULT_SCALE3;
-   }
+    }
 
     if (mean[0] == -1.0 || mean[1] == -1.0 || mean[2] == -1.0)
     {
diff --git a/tests/models/test_model_hrnet.cpp b/tests/models/test_model_hrnet.cpp
index 18f8aacfe..7d021808a 100644
--- a/tests/models/test_model_hrnet.cpp
+++ b/tests/models/test_model_hrnet.cpp
@@ -34,27 +34,29 @@
 
 #define DEFAULT_REPEAT_COUNT 1
 #define DEFAULT_THREAD_COUNT 1
-#define LETTERBOX_ROWS 256
-#define LETTERBOX_COLS 256
-#define MODEL_CHANNELS 3
-#define HEATMAP_CHANNEL 16
+#define LETTERBOX_ROWS       256
+#define LETTERBOX_COLS       256
+#define MODEL_CHANNELS       3
+#define HEATMAP_CHANNEL      16
 
-typedef struct {
+typedef struct
+{
     float x;
     float y;
     float score;
 } ai_point_t;
 
-struct skeleton {
+struct skeleton
+{
     int connection[2];
     int left_right_neutral;
 };
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -69,13 +71,13 @@ void show_usage()
     fprintf(stderr, "[Usage]:  [-h]\n    [-m model_file]  [-r repeat_count] [-t thread_count]\n");
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     int repeat_count = DEFAULT_REPEAT_COUNT;
     int num_thread = DEFAULT_THREAD_COUNT;
     char model_string[] = "./models/hrnet.tmfile";
-    char *model_file = model_string;
-    char *image_file = nullptr;
+    char* model_file = model_string;
+    char* image_file = nullptr;
     int img_h = LETTERBOX_COLS;
     int img_w = LETTERBOX_ROWS;
     // ai_body_parts_s pose;
@@ -88,20 +90,20 @@ int main(int argc, char *argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -116,7 +118,6 @@ int main(int argc, char *argv[])
     if (!check_file_exist(model_file))
         return -1;
 
-
     /* set runtime options */
     struct options opt;
     opt.num_thread = num_thread;
@@ -142,7 +143,7 @@ int main(int argc, char *argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
+    int dims[] = {1, 3, img_h, img_w}; // nchw
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
@@ -174,7 +175,7 @@ int main(int argc, char *argv[])
     /* prepare process input data, set the data mem to input tensor */
     std::string model_name = "hrnet";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
@@ -207,12 +208,12 @@ int main(int argc, char *argv[])
     /* get output tensor */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
 
-    float *data = (float *) (get_tensor_buffer(output_tensor));
+    float* data = (float*)(get_tensor_buffer(output_tensor));
     int output_size1 = get_tensor_buffer_size(output_tensor) / (sizeof(float));
 
     std::string reference_file1 = "./data/" + model_name + "_out.bin";
     std::vector<float> reference_data1(output_size1);
-    FILE *fp1;
+    FILE* fp1;
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
     {
@@ -221,7 +222,7 @@ int main(int argc, char *argv[])
     }
     fclose(fp1);
     int ret1 = float_mismatch(data, reference_data1.data(), output_size1);
-    
+
     postrun_graph(graph);
     destroy_graph(graph);
     release_tengine();
diff --git a/tests/models/test_model_landmark.cpp b/tests/models/test_model_landmark.cpp
index 9ece39520..4a5f442e5 100644
--- a/tests/models/test_model_landmark.cpp
+++ b/tests/models/test_model_landmark.cpp
@@ -35,10 +35,10 @@
 #include <cmath>
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -51,7 +51,7 @@ void get_input_fp32_data(const char* image_file, float* input_data, int img_h, i
 {
     image img = imread_process(image_file, img_w, img_h, mean, scale);
 
-    float* image_data = ( float* )img.data;
+    float* image_data = (float*)img.data;
 
     for (int i = 0; i < img_w * img_h * 3; i++)
         input_data[i] = image_data[i];
@@ -80,20 +80,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -129,8 +129,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = (float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -159,10 +159,10 @@ int main(int argc, char* argv[])
     }
 
     /* prepare process input data, set the data mem to input tensor */
-    std::string model_name="landmark";
+    std::string model_name = "landmark";
     // get_input_fp32_data(image_file, input_data, img_h, img_w, mean, scale);
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data, sizeof(float), img_size, fp) == 0)
     {
@@ -197,13 +197,13 @@ int main(int argc, char* argv[])
     /* get output tensor */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
 
-    float* output_data = ( float* )(get_tensor_buffer(output_tensor));
-    int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float );
+    float* output_data = (float*)(get_tensor_buffer(output_tensor));
+    int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
     // save output_data
 
     std::string reference_file1 = "./data/" + model_name + "_out.bin";
     std::vector<float> reference_data(data_size);
-    FILE *fp1;
+    FILE* fp1;
     //read
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data.data(), sizeof(float), data_size, fp1) == 0)
diff --git a/tests/models/test_model_mobilefacenet.cpp b/tests/models/test_model_mobilefacenet.cpp
index 38a30050d..5ff9b88ba 100644
--- a/tests/models/test_model_mobilefacenet.cpp
+++ b/tests/models/test_model_mobilefacenet.cpp
@@ -36,7 +36,7 @@
 #define DEFAULT_MEAN3 122.679
 
 #define MOBILE_FACE_HEIGHT 110
-#define MOBILE_FACE_WIDTH 110
+#define MOBILE_FACE_WIDTH  110
 
 graph_t graph;
 tensor_t input_tensor;
@@ -45,10 +45,10 @@ int feature_len;
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -86,7 +86,7 @@ int getFeature_a(const char* imagefile, float* feature)
     std::vector<float> input_data(img_size);
     std::string model_name = "mobilefacenet";
     std::string input_file = "./data/" + model_name + "_in1.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
@@ -94,7 +94,7 @@ int getFeature_a(const char* imagefile, float* feature)
         return -1;
     }
     fclose(fp);
-    
+
     set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float));
 
     if (run_graph(graph, 1) < 0)
@@ -102,7 +102,7 @@ int getFeature_a(const char* imagefile, float* feature)
         fprintf(stderr, "run_graph fail");
         return -1;
     }
-    float* data = ( float* )get_tensor_buffer(output_tensor);
+    float* data = (float*)get_tensor_buffer(output_tensor);
     int outsize;
     outsize = get_tensor_buffer_size(output_tensor) / sizeof(float);
     for (int i = 0; i < outsize; i++)
@@ -111,7 +111,7 @@ int getFeature_a(const char* imagefile, float* feature)
     // save output_data
     std::string reference_file1 = "./data/" + model_name + "_out1.bin";
     std::vector<float> reference_data1(outsize);
-    FILE *fp1;
+    FILE* fp1;
     //read
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data1.data(), sizeof(float), outsize, fp1) == 0)
@@ -135,7 +135,7 @@ int getFeature_b(const char* imagefile, float* feature)
     std::vector<float> input_data(img_size);
     std::string model_name = "mobilefacenet";
     std::string input_file = "./data/" + model_name + "_in2.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
@@ -149,7 +149,7 @@ int getFeature_b(const char* imagefile, float* feature)
         fprintf(stderr, "run_graph fail");
         return -1;
     }
-    float* data = ( float* )get_tensor_buffer(output_tensor);
+    float* data = (float*)get_tensor_buffer(output_tensor);
     int outsize;
     outsize = get_tensor_buffer_size(output_tensor) / sizeof(float);
     for (int i = 0; i < outsize; i++)
@@ -158,7 +158,7 @@ int getFeature_b(const char* imagefile, float* feature)
     // save output_data
     std::string reference_file1 = "./data/" + model_name + "_out2.bin";
     std::vector<float> reference_data1(outsize);
-    FILE *fp1;
+    FILE* fp1;
     //read
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data1.data(), sizeof(float), outsize, fp1) == 0)
@@ -196,14 +196,14 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -226,7 +226,7 @@ int main(int argc, char* argv[])
     int outputsizea = getFeature_a(person_a, featurea.data());
     int outputsizeb = getFeature_b(person_b, featureb.data());
 
-	int ret = (outputsizea | outputsizeb);
+    int ret = (outputsizea | outputsizeb);
     release();
     return ret;
 }
\ No newline at end of file
diff --git a/tests/models/test_model_mobilenet_ssd.c b/tests/models/test_model_mobilenet_ssd.c
index 8f65a5eda..134708761 100644
--- a/tests/models/test_model_mobilenet_ssd.c
+++ b/tests/models/test_model_mobilenet_ssd.c
@@ -29,15 +29,15 @@
 #include "tengine_operations.h"
 
 #define DEFAULT_MAX_BOX_COUNT 100
-#define DEFAULT_REPEAT_COUNT    1
-#define DEFAULT_THREAD_COUNT    1
+#define DEFAULT_REPEAT_COUNT  1
+#define DEFAULT_THREAD_COUNT  1
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -68,20 +68,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -117,8 +117,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -147,9 +147,9 @@ int main(int argc, char* argv[])
     }
 
     /* prepare process input data, set the data mem to input tensor */
-    char *model_name="mobilenet_ssd";
+    char* model_name = "mobilenet_ssd";
     char* input_file = "./data/mobilenet_ssd_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file, "rb");
     if (fread(input_data, sizeof(float), img_size, fp) == 0)
     {
@@ -183,15 +183,15 @@ int main(int argc, char* argv[])
     fprintf(stderr, "--------------------------------------\n");
 
     /* process the detection result */
-    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);    //"detection_out"
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); //"detection_out"
     int out_dim[4];
     get_tensor_shape(output_tensor, out_dim, 4);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
 
     int output_size1 = get_tensor_buffer_size(output_tensor) / sizeof(float);
     char* reference_file1 = "./data/mobilenet_ssd_out.bin";
-    float* reference_data1=(float* )malloc(output_size1*4);
-    FILE *fp1;
+    float* reference_data1 = (float*)malloc(output_size1 * 4);
+    FILE* fp1;
     //read
     fp1 = fopen(reference_file1, "rb");
     if (fread(reference_data1, sizeof(float), output_size1, fp1) == 0)
diff --git a/tests/models/test_model_nanodet_m.cpp b/tests/models/test_model_nanodet_m.cpp
index eb5cc9300..8fd17dc0d 100644
--- a/tests/models/test_model_nanodet_m.cpp
+++ b/tests/models/test_model_nanodet_m.cpp
@@ -36,22 +36,21 @@
 #include "tengine_operations.h"
 
 // tengine output tensor names
-const char *cls_pred_name[] = {
-    "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"
-};
-const char *dis_pred_name[] = {
+const char* cls_pred_name[] = {
+    "cls_pred_stride_8", "cls_pred_stride_16", "cls_pred_stride_32"};
+const char* dis_pred_name[] = {
 #ifdef TRY_POST_SOFTMAX
     "dis_pred_stride_8", "dis_pred_stride_16", "dis_pred_stride_32"
-#else /* !TRY_POST_SOFTMAX */
+#else  /* !TRY_POST_SOFTMAX */
     "dis_sm_stride_8", "dis_sm_stride_16", "dis_sm_stride_32"
 #endif /* TRY_POST_SOFTMAX */
 };
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -61,15 +60,17 @@ int float_mismatch(float* current, float* reference, int size)
     return 0;
 }
 
-static void show_usage() {
+static void show_usage()
+{
     fprintf(stderr, "[Usage]: [-h]\n");
     fprintf(stderr, "   [-m model_file] [-r repeat_count] [-t thread_count] [-o output_file]\n");
 }
 
-int main(int argc, char* argv[]) {
+int main(int argc, char* argv[])
+{
     const char* model_file = "./models/nanodet.tmfile";
-    const float mean[3] = { 103.53f, 116.28f, 123.675f }; // bgr
-    const float norm[3] = { 0.017429f, 0.017507f, 0.017125f };
+    const float mean[3] = {103.53f, 116.28f, 123.675f}; // bgr
+    const float norm[3] = {0.017429f, 0.017507f, 0.017125f};
 
     int repeat_count = 1;
     int num_thread = 1;
@@ -78,32 +79,36 @@ int main(int argc, char* argv[]) {
     const float nms_threshold = 0.5f;
 
     int res;
-    while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1) {
-        switch (res) {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+    while ((res = getopt(argc, argv, "m:i:o:r:t:h:")) != -1)
+    {
+        switch (res)
+        {
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
     /* check files */
-    if (nullptr == model_file) {
+    if (nullptr == model_file)
+    {
         fprintf(stderr, "Error: Tengine model file not specified!\n");
         show_usage();
         return -1;
     }
-    if (!check_file_exist(model_file)) {
+    if (!check_file_exist(model_file))
+    {
         return -1;
     }
 
@@ -115,7 +120,8 @@ int main(int argc, char* argv[]) {
     opt.affinity = 0;
 
     /* inital tengine */
-    if (0 != init_tengine()) {
+    if (0 != init_tengine())
+    {
         fprintf(stderr, "Initial tengine failed.\n");
         return -1;
     }
@@ -123,25 +129,27 @@ int main(int argc, char* argv[]) {
 
     /* create graph, load tengine model xxx.tmfile */
     graph_t graph = create_graph(nullptr, "tengine", model_file);
-    if (nullptr == graph) {
+    if (nullptr == graph)
+    {
         fprintf(stderr, "Create graph failed.\n");
         return -1;
     }
 
     /* get input tensor of graph */
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
-    if (nullptr == input_tensor) {
+    if (nullptr == input_tensor)
+    {
         fprintf(stderr, "Get input tensor failed\n");
         return -1;
     }
 
-    int img_size = 320 * 320 * 3;  // lb.w * lb.h * lb.c;
+    int img_size = 320 * 320 * 3; // lb.w * lb.h * lb.c;
 
     std::string model_name = "nanodet";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    std::vector<float>input_data(img_size * sizeof(float ));
+    std::vector<float> input_data(img_size * sizeof(float));
 
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
@@ -150,12 +158,14 @@ int main(int argc, char* argv[]) {
     }
     fclose(fp);
     /* set the data mem to input tensor */
-    if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0) {
+    if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0)
+    {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
     }
     /* prerun graph to infer shape, and set work options(num_thread, cluster, precision) */
-    if (prerun_graph_multithread(graph, opt) < 0) {
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
         fprintf(stderr, "Prerun multithread graph failed.\n");
         return -1;
     }
@@ -164,9 +174,11 @@ int main(int argc, char* argv[]) {
     double min_time = DBL_MAX;
     double max_time = DBL_MIN;
     double total_time = 0.;
-    for (int i = 0; i < repeat_count; i++) {
+    for (int i = 0; i < repeat_count; i++)
+    {
         double start = get_current_time();
-        if (run_graph(graph, 1) < 0) {
+        if (run_graph(graph, 1) < 0)
+        {
             fprintf(stderr, "Run graph failed\n");
             return -1;
         }
@@ -177,30 +189,32 @@ int main(int argc, char* argv[]) {
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
-	int ret = 0;
+    int ret = 0;
     /* nanodet_m postprocess */
     // std::vector<Object> proposals, objects;
-    for (int stride_index = 0; stride_index < 3; stride_index++) {
+    for (int stride_index = 0; stride_index < 3; stride_index++)
+    {
         tensor_t cls_tensor = get_graph_tensor(graph, cls_pred_name[stride_index]);
         tensor_t dis_tensor = get_graph_tensor(graph, dis_pred_name[stride_index]);
-        if (NULL == cls_tensor || NULL ==dis_tensor) {
+        if (NULL == cls_tensor || NULL == dis_tensor)
+        {
             fprintf(stderr, "get graph tensor failed\n");
             return -1;
         }
-        float *cls_pred = (float *)get_tensor_buffer(cls_tensor);
-        float *dis_pred = (float *)get_tensor_buffer(dis_tensor);
+        float* cls_pred = (float*)get_tensor_buffer(cls_tensor);
+        float* dis_pred = (float*)get_tensor_buffer(dis_tensor);
 
         // save output_data
         int output_size1 = get_tensor_buffer_size(cls_tensor) / sizeof(float);
         int output_size2 = get_tensor_buffer_size(dis_tensor) / sizeof(float);
-        std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index*2+1) +".bin";
-        std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index*2+2) +".bin";
+        std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index * 2 + 1) + ".bin";
+        std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index * 2 + 2) + ".bin";
         std::vector<float> reference_data1(output_size1);
         std::vector<float> reference_data2(output_size2);
-        FILE *fp1;
+        FILE* fp1;
         //read
         fp1 = fopen(reference_file1.c_str(), "rb");
         if (!fp || fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
@@ -218,7 +232,7 @@ int main(int argc, char* argv[]) {
         fclose(fp1);
         int ret1 = float_mismatch(cls_pred, reference_data1.data(), output_size1);
         int ret2 = float_mismatch(dis_pred, reference_data2.data(), output_size2);
-		ret = ret | (ret1 | ret2);
+        ret = ret | (ret1 | ret2);
     }
 
     /* release tengine */
@@ -227,4 +241,3 @@ int main(int argc, char* argv[]) {
     release_tengine();
     return ret;
 }
-
diff --git a/tests/models/test_model_openpose.cpp b/tests/models/test_model_openpose.cpp
index e304e42f0..67074c78d 100644
--- a/tests/models/test_model_openpose.cpp
+++ b/tests/models/test_model_openpose.cpp
@@ -40,10 +40,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -72,20 +72,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -122,9 +122,9 @@ int main(int argc, char* argv[])
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int channel = 3;
     int img_size = img_h * img_w * channel;
-    int dims[] = {1, channel, img_h, img_w};    // nchw
+    int dims[] = {1, channel, img_h, img_w}; // nchw
 
-    float* input_data = ( float* )malloc(sizeof(float) * img_size);
+    float* input_data = (float*)malloc(sizeof(float) * img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -143,7 +143,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -155,7 +155,7 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     std::string model_name = "openpose_coco";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data, sizeof(float), img_size, fp) == 0)
     {
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-    float* outdata = ( float* )get_tensor_buffer(out_tensor);
+    float* outdata = (float*)get_tensor_buffer(out_tensor);
     int H = out_dim[2];
     int W = out_dim[3];
     float show_threshold = 0.1;
@@ -203,7 +203,7 @@ int main(int argc, char* argv[])
     std::string reference_file1 = "./data/" + model_name + "_out.bin";
     int output_size1 = get_tensor_buffer_size(out_tensor) / (sizeof(float));
     std::vector<float> reference_data1(output_size1);
-    FILE *fp1;
+    FILE* fp1;
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
     {
@@ -221,4 +221,3 @@ int main(int argc, char* argv[])
 
     return ret1;
 }
-
diff --git a/tests/models/test_model_retinaface.cpp b/tests/models/test_model_retinaface.cpp
index 6233ffa80..17f33b609 100644
--- a/tests/models/test_model_retinaface.cpp
+++ b/tests/models/test_model_retinaface.cpp
@@ -60,7 +60,6 @@
 
 #define MODEL_PATH "models/retinaface.tmfile"
 
-
 const float CONF_THRESH = 0.8f;
 const float NMS_THRESH = 0.4f;
 
@@ -76,13 +75,12 @@ const int stride[3] = {32, 16, 8};
 
 const float scales[3][2] = {{32.f, 16.f}, {8.f, 4.f}, {2.f, 1.f}};
 
-
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -92,7 +90,6 @@ int float_mismatch(float* current, float* reference, int size)
     return 0;
 }
 
-
 void show_usage()
 {
     printf("[Usage]:  [-h]\n    [-m model_file]  [-r repeat_count] [-t thread_count] [-n device_name]\n");
@@ -112,23 +109,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'n':
-                device_name = optarg;
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'n':
+            device_name = optarg;
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -140,7 +137,6 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-
     if (!check_file_exist(model_file))
         return -1;
 
@@ -149,7 +145,7 @@ int main(int argc, char* argv[])
     opt.num_thread = num_thread;
     opt.cluster = TENGINE_CLUSTER_ALL;
     opt.precision = TENGINE_MODE_FP32;
-    opt.affinity = 0;       
+    opt.affinity = 0;
 
     /* inital tengine */
     int ret = init_tengine();
@@ -175,19 +171,18 @@ int main(int argc, char* argv[])
     int img_size = height * width * 3;
     std::vector<float> image_data(img_size * sizeof(float));
 
-
     std::string model_name = "retinaface";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
-    if (!fp )
+    if (!fp)
     {
-        fprintf(stderr, "open input file %s failed!\n",input_file.c_str());
+        fprintf(stderr, "open input file %s failed!\n", input_file.c_str());
         return -1;
     }
     if (!fp || fread(image_data.data(), sizeof(float), img_size, fp) == 0)
     {
-        fprintf(stderr, "read input file %s failed!\n",input_file.c_str());
+        fprintf(stderr, "read input file %s failed!\n", input_file.c_str());
         return -1;
     }
     fclose(fp);
@@ -214,7 +209,7 @@ int main(int argc, char* argv[])
     {
         printf("Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (0 != prerun_graph_multithread(graph, opt))
@@ -242,7 +237,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     printf("Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count,
-           num_thread, total_time / ( float )repeat_count, max_time, min_time);
+           num_thread, total_time / (float)repeat_count, max_time, min_time);
     printf("--------------------------------------\n");
 
     /* process the detection result */
@@ -264,41 +259,41 @@ int main(int argc, char* argv[])
         get_tensor_shape(bbox_blob_tensor, bbox_blob_dims, MAX_SHAPE_DIM_NUM);
         get_tensor_shape(landmark_blob_tensor, landmark_blob_dims, MAX_SHAPE_DIM_NUM);
 
-        float* score_blob = ( float* )get_tensor_buffer(score_blob_tensor);
-        float* bbox_blob = ( float* )get_tensor_buffer(bbox_blob_tensor);
-        float* landmark_blob = ( float* )get_tensor_buffer(landmark_blob_tensor);
+        float* score_blob = (float*)get_tensor_buffer(score_blob_tensor);
+        float* bbox_blob = (float*)get_tensor_buffer(bbox_blob_tensor);
+        float* landmark_blob = (float*)get_tensor_buffer(landmark_blob_tensor);
 
         // save output_data
         int output_size1 = get_tensor_buffer_size(score_blob_tensor) / sizeof(float);
         int output_size2 = get_tensor_buffer_size(bbox_blob_tensor) / sizeof(float);
         int output_size3 = get_tensor_buffer_size(landmark_blob_tensor) / sizeof(float);
-        std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index*3+1) +".bin";
-        std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index*3+2) +".bin";
-        std::string reference_file3 = "./data/" + model_name + "_out" + std::to_string(stride_index*3+3) +".bin";
+        std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(stride_index * 3 + 1) + ".bin";
+        std::string reference_file2 = "./data/" + model_name + "_out" + std::to_string(stride_index * 3 + 2) + ".bin";
+        std::string reference_file3 = "./data/" + model_name + "_out" + std::to_string(stride_index * 3 + 3) + ".bin";
         std::vector<float> reference_data1(output_size1);
         std::vector<float> reference_data2(output_size2);
         std::vector<float> reference_data3(output_size3);
-        FILE *fp1;
+        FILE* fp1;
 
         //read
         fp1 = fopen(reference_file1.c_str(), "rb");
         if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
         {
-            fprintf(stderr, "read reference %s failed!\n",reference_file1.c_str());
+            fprintf(stderr, "read reference %s failed!\n", reference_file1.c_str());
             return -1;
         }
         fclose(fp1);
         fp1 = fopen(reference_file2.c_str(), "rb");
         if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0)
         {
-            fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str());
+            fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str());
             return -1;
         }
         fclose(fp1);
         fp1 = fopen(reference_file3.c_str(), "rb");
         if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0)
         {
-            fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str());
+            fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str());
             return -1;
         }
         fclose(fp1);
diff --git a/tests/models/test_model_ultraface.cpp b/tests/models/test_model_ultraface.cpp
index c2b9727a4..2c7991459 100644
--- a/tests/models/test_model_ultraface.cpp
+++ b/tests/models/test_model_ultraface.cpp
@@ -31,12 +31,12 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_REPEAT_COUNT    1
-#define DEFAULT_THREAD_COUNT    1
-#define num_featuremap 4
-#define hard_nms 1
-#define blending_nms 2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/
-#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x))
+#define DEFAULT_REPEAT_COUNT 1
+#define DEFAULT_THREAD_COUNT 1
+#define num_featuremap       4
+#define hard_nms             1
+#define blending_nms         2 /* mix nms was been proposaled in paper blaze face, aims to minimize the temporal jitter*/
+#define clip(x, y)           (x < 0 ? 0 : (x > y ? y : x))
 
 typedef struct FaceInfo
 {
@@ -57,10 +57,10 @@ const float g_center_variance = 0.1f;
 const float g_size_variance = 0.2f;
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -88,20 +88,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
     std::string model_name = "version-RFB-320_simplified";
@@ -114,7 +114,6 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-
     if (!check_file_exist(model_file))
         return -1;
 
@@ -139,8 +138,8 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = g_tensor_in_h * g_tensor_in_w * 3;
-    int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, g_tensor_in_h, g_tensor_in_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -159,7 +158,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -171,7 +170,7 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     //save input
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
 
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data, sizeof(float), img_size, fp) == 0)
@@ -181,7 +180,6 @@ int main(int argc, char* argv[])
     }
     fclose(fp);
 
-
     /* run graph */
     double min_time = DBL_MAX;
     double max_time = DBL_MIN;
@@ -210,8 +208,8 @@ int main(int argc, char* argv[])
     tensor_t boxs_tensor = get_graph_output_tensor(graph, 0, 0);
     tensor_t scores_tensor = get_graph_output_tensor(graph, 1, 0);
 
-    float* boxs_data = (float* )get_tensor_buffer(boxs_tensor);
-    float* scores_data = (float* )get_tensor_buffer(scores_tensor);
+    float* boxs_data = (float*)get_tensor_buffer(boxs_tensor);
+    float* scores_data = (float*)get_tensor_buffer(scores_tensor);
 
     // save output_data
     int output_size1 = get_tensor_buffer_size(boxs_tensor) / sizeof(float);
@@ -220,7 +218,7 @@ int main(int argc, char* argv[])
     std::string reference_file2 = "./data/" + model_name + "_out2.bin";
     std::vector<float> reference_data1(output_size1);
     std::vector<float> reference_data2(output_size2);
-    FILE *fp1;
+    FILE* fp1;
     //write
 
     //read
@@ -252,4 +250,3 @@ int main(int argc, char* argv[])
 
     return ret;
 }
-
diff --git a/tests/models/test_model_unet.cpp b/tests/models/test_model_unet.cpp
index 2380d5081..71df233c0 100644
--- a/tests/models/test_model_unet.cpp
+++ b/tests/models/test_model_unet.cpp
@@ -35,17 +35,17 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-#define DEFAULT_IMG_H 512 
-#define DEFAULT_IMG_W 512
-#define DEFAULT_SCALE1 (1.f/255.f)
-#define DEFAULT_SCALE2 (1.f/255.f)
-#define DEFAULT_SCALE3 (1.f/255.f)
-#define DEFAULT_MEAN1 0
-#define DEFAULT_MEAN2 0
-#define DEFAULT_MEAN3 0
-#define DEFAULT_LOOP_COUNT 1
-#define DEFAULT_THREAD_COUNT 1
-#define DEFAULT_CPU_AFFINITY 255
+#define DEFAULT_IMG_H          512
+#define DEFAULT_IMG_W          512
+#define DEFAULT_SCALE1         (1.f / 255.f)
+#define DEFAULT_SCALE2         (1.f / 255.f)
+#define DEFAULT_SCALE3         (1.f / 255.f)
+#define DEFAULT_MEAN1          0
+#define DEFAULT_MEAN2          0
+#define DEFAULT_MEAN3          0
+#define DEFAULT_LOOP_COUNT     1
+#define DEFAULT_THREAD_COUNT   1
+#define DEFAULT_CPU_AFFINITY   255
 #define DEFAULT_CONF_THRESHOLD 0.5f
 
 /**
@@ -56,10 +56,10 @@
  */
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -70,7 +70,7 @@ int float_mismatch(float* current, float* reference, int size)
 }
 
 int tengine_segment(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
-                     const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh)
+                    const float* scale, int loop_count, int num_thread, int affinity, float conf_thresh)
 {
     /* set runtime options */
     struct options opt;
@@ -97,8 +97,8 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * 3;
-    int dims[] = {1, 3, img_h, img_w};    // nchw
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    int dims[] = {1, 3, img_h, img_w}; // nchw
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == NULL)
@@ -117,7 +117,7 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -129,7 +129,7 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
     /* prepare process input data, set the data mem to input tensor */
     std::string model_name = "unet";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
 
     fp = fopen(input_file.c_str(), "rb");
     if (!fp || fread(input_data, sizeof(float), img_size, fp) == 0)
@@ -168,12 +168,12 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
 
     /* get the result of classification */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     std::string reference_file1 = "./data/" + model_name + "_out.bin";
     std::vector<float> reference_data1(output_size);
-    FILE *fp1;
+    FILE* fp1;
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (!fp || fread(reference_data1.data(), sizeof(float), output_size, fp1) == 0)
     {
@@ -182,9 +182,9 @@ int tengine_segment(const char* model_file, const char* image_file, int img_h, i
     }
     fclose(fp1);
     int ret1 = float_mismatch(output_data, reference_data1.data(), output_size);
-      /* single class segmentation */
-      /* multi-class segmentation */
-      /* visualization */
+    /* single class segmentation */
+    /* multi-class segmentation */
+    /* visualization */
 
     /* release tengine */
     free(input_data);
@@ -222,26 +222,26 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                loop_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'a':
-                cpu_affinity = atoi(optarg);
-                break;
-            case 'c':
-                conf_thresh = atof(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            loop_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'a':
+            cpu_affinity = atoi(optarg);
+            break;
+        case 'c':
+            conf_thresh = atof(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -253,26 +253,23 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-
     if (!check_file_exist(model_file))
         return -1;
 
-        img_h = DEFAULT_IMG_H;
+    img_h = DEFAULT_IMG_H;
 
-        img_w = DEFAULT_IMG_W;
+    img_w = DEFAULT_IMG_W;
 
-        scale[0] = DEFAULT_SCALE1;
-        scale[1] = DEFAULT_SCALE2;
-        scale[2] = DEFAULT_SCALE3;
-
-        mean[0] = DEFAULT_MEAN1;
-        mean[1] = DEFAULT_MEAN2;
-        mean[2] = DEFAULT_MEAN3;
+    scale[0] = DEFAULT_SCALE1;
+    scale[1] = DEFAULT_SCALE2;
+    scale[2] = DEFAULT_SCALE3;
 
+    mean[0] = DEFAULT_MEAN1;
+    mean[1] = DEFAULT_MEAN2;
+    mean[2] = DEFAULT_MEAN3;
 
     if (tengine_segment(model_file, image_file, img_h, img_w, mean, scale, loop_count, num_thread, cpu_affinity, conf_thresh) < 0)
         return -1;
 
     return 0;
 }
-
diff --git a/tests/models/test_model_yolact.cpp b/tests/models/test_model_yolact.cpp
index 20af24355..83245f2f7 100644
--- a/tests/models/test_model_yolact.cpp
+++ b/tests/models/test_model_yolact.cpp
@@ -53,10 +53,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -83,20 +83,20 @@ int main(int argc, char** argv)
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -138,7 +138,7 @@ int main(int argc, char** argv)
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = target_size * target_size * 3;
-    int dims[] = {1, 3, target_size, target_size};    // nchw
+    int dims[] = {1, 3, target_size, target_size}; // nchw
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
@@ -158,7 +158,7 @@ int main(int argc, char** argv)
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -170,7 +170,7 @@ int main(int argc, char** argv)
     /* prepare process input data, set the data mem to input tensor */
     std::string model_name = "yolact";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
@@ -206,10 +206,10 @@ int main(int argc, char** argv)
     tensor_t location_tensor = get_graph_output_tensor(graph, 2, 0);
     tensor_t mask_tensor = get_graph_output_tensor(graph, 3, 0);
     tensor_t confidence_tensor = get_graph_output_tensor(graph, 4, 0);
-    float* maskmaps = ( float* )get_tensor_buffer(maskmaps_tensor);
-    float* location = ( float* )get_tensor_buffer(location_tensor);
-    float* mask = ( float* )get_tensor_buffer(mask_tensor);
-    float* confidence = ( float* )get_tensor_buffer(confidence_tensor);
+    float* maskmaps = (float*)get_tensor_buffer(maskmaps_tensor);
+    float* location = (float*)get_tensor_buffer(location_tensor);
+    float* mask = (float*)get_tensor_buffer(mask_tensor);
+    float* confidence = (float*)get_tensor_buffer(confidence_tensor);
 
     // save output_data
     int output_size1 = get_tensor_buffer_size(maskmaps_tensor) / sizeof(float);
@@ -224,32 +224,32 @@ int main(int argc, char** argv)
     std::vector<float> reference_data2(output_size2);
     std::vector<float> reference_data3(output_size3);
     std::vector<float> reference_data4(output_size4);
-    FILE *fp1;
+    FILE* fp1;
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
     {
-        fprintf(stderr, "read %s data failed!\n",reference_file1.c_str());
+        fprintf(stderr, "read %s data failed!\n", reference_file1.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file2.c_str(), "rb");
     if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0)
     {
-        fprintf(stderr, "read %s data failed!\n",reference_file2.c_str());
+        fprintf(stderr, "read %s data failed!\n", reference_file2.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file3.c_str(), "rb");
     if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0)
     {
-        fprintf(stderr, "read %s data failed!\n",reference_file3.c_str());
+        fprintf(stderr, "read %s data failed!\n", reference_file3.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file4.c_str(), "rb");
     if (fread(reference_data4.data(), sizeof(float), output_size4, fp1) == 0)
     {
-        fprintf(stderr, "read %s data failed!\n",reference_file4.c_str());
+        fprintf(stderr, "read %s data failed!\n", reference_file4.c_str());
         return -1;
     }
     fclose(fp1);
@@ -267,4 +267,3 @@ int main(int argc, char** argv)
 
     return ret;
 }
-
diff --git a/tests/models/test_model_yolofastest.cpp b/tests/models/test_model_yolofastest.cpp
index 1f90d05de..a67fb1e4b 100644
--- a/tests/models/test_model_yolofastest.cpp
+++ b/tests/models/test_model_yolofastest.cpp
@@ -23,7 +23,7 @@
  * 
  * original model: https://github.com/dog-qiuqiu/Yolo-Fastest/tree/master/ModelZoo/yolo-fastest-1.1_coco
  */
- 
+
 #include <iostream>
 #include <iomanip>
 #include <vector>
@@ -44,10 +44,10 @@
 #define DEFAULT_THREAD_COUNT 1
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -76,20 +76,20 @@ struct TMat
         return (const float*)data;
     }
 
-    float *row(int row) const
+    float* row(int row) const
     {
-        return (float *)data + w * row;
+        return (float*)data + w * row;
     }
 
-    TMat channel_range(int start, int chn_num) const 
+    TMat channel_range(int start, int chn_num) const
     {
-        TMat mat = { 0 };
+        TMat mat = {0};
 
         mat.batch = 1;
         mat.c = chn_num;
         mat.h = h;
         mat.w = w;
-        mat.data = (float *)data + start * h * w;
+        mat.data = (float*)data + start * h * w;
 
         return mat;
     }
@@ -100,7 +100,7 @@ struct TMat
     }
 
     int batch, c, h, w;
-    void *data;
+    void* data;
 };
 
 int main(int argc, char* argv[])
@@ -143,7 +143,7 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-   if (!check_file_exist(model_file))
+    if (!check_file_exist(model_file))
         return -1;
 
     /* set runtime options */
@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
 
     /* set the input shape to initial the graph, and prerun graph to infer shape */
     int img_size = net_h * net_w * 3;
-    int dims[] = { 1, 3, net_h, net_w };    // nchw
+    int dims[] = {1, 3, net_h, net_w}; // nchw
 
     std::vector<float> input_data(img_size);
 
@@ -202,11 +202,11 @@ int main(int argc, char* argv[])
     }
 
     /* prepare process input data, set the data mem to input tensor */
-    std::string model_name="yolo-fastest-1.1";
+    std::string model_name = "yolo-fastest-1.1";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
-    if (!fp ||fread(input_data.data(), sizeof(float), img_size, fp) == 0)
+    if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
         fprintf(stderr, "read input data file failed!\n");
         return -1;
@@ -232,24 +232,25 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count,
-        num_thread, total_time / repeat_count, max_time, min_time);
+            num_thread, total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* process the detection result */
 
     int output_node_num = get_graph_output_node_number(graph);
     int ret1 = 0;
-	tensor_t out_tensor;
+    tensor_t out_tensor;
     for (int i = 0; i < output_node_num; ++i)
     {
-        out_tensor = get_graph_output_tensor(graph, i, 0);    //"detection_out"
+        out_tensor = get_graph_output_tensor(graph, i, 0); //"detection_out"
         // save output_data
         std::string model_name = "yolo-fastest-1.1";
-        int output_size1 = get_tensor_buffer_size(out_tensor) / sizeof(float);;
+        int output_size1 = get_tensor_buffer_size(out_tensor) / sizeof(float);
+        ;
         float* yolo_outputs = (float*)get_tensor_buffer(out_tensor);
-        std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(i+1)+".bin";
+        std::string reference_file1 = "./data/" + model_name + "_out" + std::to_string(i + 1) + ".bin";
         std::vector<float> reference_data1(output_size1);
-        FILE *fp1;
+        FILE* fp1;
         //read
         fp1 = fopen(reference_file1.c_str(), "rb");
         if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
diff --git a/tests/models/test_model_yolov3.cpp b/tests/models/test_model_yolov3.cpp
index fb4a459bd..447f32a4f 100644
--- a/tests/models/test_model_yolov3.cpp
+++ b/tests/models/test_model_yolov3.cpp
@@ -35,10 +35,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.001)
+        if (fabs(tmp) > 0.001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -119,11 +119,11 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     // read input_data
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
-        fprintf(stderr, "read input data file %s failed!\n",input_file.c_str());
+        fprintf(stderr, "read input data file %s failed!\n", input_file.c_str());
         return -1;
     }
     fclose(fp);
@@ -140,13 +140,13 @@ int main(int argc, char* argv[])
     fprintf(stderr, "Inference time %.2f ms\n", end - start);
     fprintf(stderr, "--------------------------------------\n");
 
-    tensor_t p8_output  = get_graph_output_tensor(graph, 2, 0);
+    tensor_t p8_output = get_graph_output_tensor(graph, 2, 0);
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
-    
-    float* p8_data  = ( float*)get_tensor_buffer(p8_output);
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+
+    float* p8_data = (float*)get_tensor_buffer(p8_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
     /* check the result */
     int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float);
@@ -155,26 +155,26 @@ int main(int argc, char* argv[])
     std::string reference_file1 = "./data/" + model_name + "_out1.bin";
     std::string reference_file2 = "./data/" + model_name + "_out2.bin";
     std::string reference_file3 = "./data/" + model_name + "_out3.bin";
-    std::vector<float> reference_data1(output_size1),reference_data2(output_size2),reference_data3(output_size3);
-    FILE *fp1;
+    std::vector<float> reference_data1(output_size1), reference_data2(output_size2), reference_data3(output_size3);
+    FILE* fp1;
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (!fp1 || fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file1.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file1.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file2.c_str(), "rb");
     if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file3.c_str(), "rb");
     if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str());
         return -1;
     }
     fclose(fp1);
diff --git a/tests/models/test_model_yolov3_tiny.cpp b/tests/models/test_model_yolov3_tiny.cpp
index 9654b2bc2..da48cc6e5 100644
--- a/tests/models/test_model_yolov3_tiny.cpp
+++ b/tests/models/test_model_yolov3_tiny.cpp
@@ -35,10 +35,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -48,7 +48,6 @@ int float_mismatch(float* current, float* reference, int size)
     return 0;
 }
 
-
 void show_usage()
 {
     fprintf(
@@ -58,7 +57,7 @@ void show_usage()
 
 int main(int argc, char* argv[])
 {
-    const char* model_file ="./models/yolov3-tiny.tmfile";
+    const char* model_file = "./models/yolov3-tiny.tmfile";
     int img_h = 416;
     int img_w = 416;
     int img_c = 3;
@@ -73,23 +72,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
-    std::string model_name="yolov3_tiny";
+    std::string model_name = "yolov3_tiny";
     /* check files */
     if (nullptr == model_file)
     {
@@ -98,11 +97,9 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-
     if (!check_file_exist(model_file))
         return -1;
 
-
     /* set runtime options */
     struct options opt;
     opt.num_thread = num_thread;
@@ -128,7 +125,7 @@ int main(int argc, char* argv[])
 
     int img_size = img_h * img_w * img_c;
     int dims[] = {1, 3, img_h, img_w};
-    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    float* input_data = (float*)malloc(img_size * sizeof(float));
 
     tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
     if (input_tensor == nullptr)
@@ -159,12 +156,12 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     // save input_data
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
 
     fp = fopen(input_file.c_str(), "rb");
     if (!fp || fread(input_data, sizeof(float), img_size, fp) == 0)
     {
-        fprintf(stderr, "read input data file %s failed!\n",input_file.c_str());
+        fprintf(stderr, "read input data file %s failed!\n", input_file.c_str());
         return -1;
     }
     fclose(fp);
@@ -188,33 +185,33 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
-    
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
     int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float);
     int output_size3 = get_tensor_buffer_size(p32_output) / sizeof(float);
 
     std::string reference_file2 = "./data/" + model_name + "_out1.bin";
     std::string reference_file3 = "./data/" + model_name + "_out2.bin";
-    std::vector<float> reference_data2(output_size2),reference_data3(output_size3);
-    FILE *fp1;
+    std::vector<float> reference_data2(output_size2), reference_data3(output_size3);
+    FILE* fp1;
     fp1 = fopen(reference_file2.c_str(), "rb");
     if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file3.c_str(), "rb");
     if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str());
         return -1;
     }
     fclose(fp1);
@@ -223,7 +220,6 @@ int main(int argc, char* argv[])
     int ret3 = float_mismatch(p32_data, reference_data3.data(), output_size3);
 
     /* postprocess */
-    
 
     /* release tengine */
     postrun_graph(graph);
diff --git a/tests/models/test_model_yolov4.cpp b/tests/models/test_model_yolov4.cpp
index ddb40a513..83f776001 100644
--- a/tests/models/test_model_yolov4.cpp
+++ b/tests/models/test_model_yolov4.cpp
@@ -37,10 +37,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -50,7 +50,6 @@ int float_mismatch(float* current, float* reference, int size)
     return 0;
 }
 
-
 void show_usage()
 {
     fprintf(
@@ -60,7 +59,7 @@ void show_usage()
 
 int main(int argc, char* argv[])
 {
-    const char* model_file ="./models/yolov4.tmfile";
+    const char* model_file = "./models/yolov4.tmfile";
     int img_h = 416;
     int img_w = 416;
     int img_c = 3;
@@ -75,23 +74,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
-    std::string model_name="yolov4";
+    std::string model_name = "yolov4";
     /* check files */
     if (nullptr == model_file)
     {
@@ -100,11 +99,9 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-
     if (!check_file_exist(model_file))
         return -1;
 
-
     /* set runtime options */
     struct options opt;
     opt.num_thread = num_thread;
@@ -161,12 +158,12 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     // read input_data
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
-	
+    FILE* fp;
+
     fp = fopen(input_file.c_str(), "rb");
     if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
-        fprintf(stderr, "read input data file %s failed!\n",input_file.c_str());
+        fprintf(stderr, "read input data file %s failed!\n", input_file.c_str());
         return -1;
     }
     fclose(fp);
@@ -190,30 +187,21 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     tensor_t p8_output = get_graph_output_tensor(graph, 0, 0);
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 2, 0);
-    
-    float* p8_data = ( float*)get_tensor_buffer(p8_output);
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
-
-	/* postprocess */
-
 
+    float* p8_data = (float*)get_tensor_buffer(p8_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
+    /* postprocess */
 
     /* yolov4 tiny draw the result */
 
-
-
-
-
-
-	
     /* check the result */
     int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float);
     int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float);
@@ -221,27 +209,27 @@ int main(int argc, char* argv[])
     std::string reference_file1 = "./data/" + model_name + "_out1.bin";
     std::string reference_file2 = "./data/" + model_name + "_out2.bin";
     std::string reference_file3 = "./data/" + model_name + "_out3.bin";
-    std::vector<float> reference_data1(output_size1),reference_data2(output_size2),reference_data3(output_size3);
-    FILE *fp1;
+    std::vector<float> reference_data1(output_size1), reference_data2(output_size2), reference_data3(output_size3);
+    FILE* fp1;
 
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (!fp1 || fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file1.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file1.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file2.c_str(), "rb");
     if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file3.c_str(), "rb");
     if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str());
         return -1;
     }
     fclose(fp1);
diff --git a/tests/models/test_model_yolov4_tiny.cpp b/tests/models/test_model_yolov4_tiny.cpp
index da07ba8bc..1679a0090 100644
--- a/tests/models/test_model_yolov4_tiny.cpp
+++ b/tests/models/test_model_yolov4_tiny.cpp
@@ -37,10 +37,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -50,7 +50,6 @@ int float_mismatch(float* current, float* reference, int size)
     return 0;
 }
 
-
 void show_usage()
 {
     fprintf(
@@ -60,7 +59,7 @@ void show_usage()
 
 int main(int argc, char* argv[])
 {
-    const char* model_file ="./models/yolov4-tiny.tmfile";
+    const char* model_file = "./models/yolov4-tiny.tmfile";
     int img_h = 416;
     int img_w = 416;
     int img_c = 3;
@@ -75,23 +74,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
-    std::string model_name="yolov4_tiny";
+    std::string model_name = "yolov4_tiny";
     /* check files */
     if (nullptr == model_file)
     {
@@ -100,11 +99,9 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-
     if (!check_file_exist(model_file))
         return -1;
 
-
     /* set runtime options */
     struct options opt;
     opt.num_thread = num_thread;
@@ -161,12 +158,12 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     // save input_data
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
 
     fp = fopen(input_file.c_str(), "rb");
     if (!fp || fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
-        fprintf(stderr, "read input data file %s failed!\n",input_file.c_str());
+        fprintf(stderr, "read input data file %s failed!\n", input_file.c_str());
         return -1;
     }
     fclose(fp);
@@ -190,41 +187,39 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
-
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 0, 0);
 
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
     int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float);
     int output_size3 = get_tensor_buffer_size(p32_output) / sizeof(float);
     std::string reference_file2 = "./data/" + model_name + "_out1.bin";
     std::string reference_file3 = "./data/" + model_name + "_out2.bin";
-    std::vector<float> reference_data2(output_size2),reference_data3(output_size3);
-    FILE *fp1;
+    std::vector<float> reference_data2(output_size2), reference_data3(output_size3);
+    FILE* fp1;
     fp1 = fopen(reference_file2.c_str(), "rb");
     if (fread(reference_data2.data(), sizeof(float), output_size2, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file2.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file2.c_str());
         return -1;
     }
     fclose(fp1);
     fp1 = fopen(reference_file3.c_str(), "rb");
     if (fread(reference_data3.data(), sizeof(float), output_size3, fp1) == 0)
     {
-        fprintf(stderr, "read reference %s failed!\n",reference_file3.c_str());
+        fprintf(stderr, "read reference %s failed!\n", reference_file3.c_str());
         return -1;
     }
     fclose(fp1);
     int ret2 = float_mismatch(p16_data, reference_data2.data(), output_size2);
     int ret3 = float_mismatch(p32_data, reference_data3.data(), output_size3);
 
-	/* postprocess */
-    
+    /* postprocess */
 
     /* release tengine */
     postrun_graph(graph);
diff --git a/tests/models/test_model_yolov5s.cpp b/tests/models/test_model_yolov5s.cpp
index 288d75388..fad8f98ca 100644
--- a/tests/models/test_model_yolov5s.cpp
+++ b/tests/models/test_model_yolov5s.cpp
@@ -35,10 +35,10 @@
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
@@ -51,8 +51,8 @@ int float_mismatch(float* current, float* reference, int size)
 void show_usage()
 {
     fprintf(
-            stderr,
-            "[Usage]:  [-h]\n    [-m model_file] [-r repeat_count] [-t thread_count]\n");
+        stderr,
+        "[Usage]:  [-h]\n    [-m model_file] [-r repeat_count] [-t thread_count]\n");
 }
 
 int main(int argc, char* argv[])
@@ -74,20 +74,20 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'r':
-                repeat_count = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 't':
-                num_thread = std::strtoul(optarg, nullptr, 10);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'r':
+            repeat_count = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 't':
+            num_thread = std::strtoul(optarg, nullptr, 10);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -158,7 +158,7 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     std::string model_name = "yolov5s";
     std::string input_file = "./data/" + model_name + "_in.bin";
-    FILE *fp;
+    FILE* fp;
     fp = fopen(input_file.c_str(), "rb");
     if (fread(input_data.data(), sizeof(float), img_size, fp) == 0)
     {
@@ -186,7 +186,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* yolov5 postprocess */
@@ -197,19 +197,19 @@ int main(int argc, char* argv[])
     tensor_t p16_output = get_graph_output_tensor(graph, 1, 0);
     tensor_t p32_output = get_graph_output_tensor(graph, 2, 0);
 
-    float* p8_data = ( float*)get_tensor_buffer(p8_output);
-    float* p16_data = ( float*)get_tensor_buffer(p16_output);
-    float* p32_data = ( float*)get_tensor_buffer(p32_output);
+    float* p8_data = (float*)get_tensor_buffer(p8_output);
+    float* p16_data = (float*)get_tensor_buffer(p16_output);
+    float* p32_data = (float*)get_tensor_buffer(p32_output);
 
     /* postprocess */
-     int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float);
+    int output_size1 = get_tensor_buffer_size(p8_output) / sizeof(float);
     int output_size2 = get_tensor_buffer_size(p16_output) / sizeof(float);
     int output_size3 = get_tensor_buffer_size(p32_output) / sizeof(float);
     std::string reference_file1 = "./data/" + model_name + "_out1.bin";
     std::string reference_file2 = "./data/" + model_name + "_out2.bin";
     std::string reference_file3 = "./data/" + model_name + "_out3.bin";
-    std::vector<float> reference_data1(output_size1),reference_data2(output_size2),reference_data3(output_size3);
-    FILE *fp1;
+    std::vector<float> reference_data1(output_size1), reference_data2(output_size2), reference_data3(output_size3);
+    FILE* fp1;
     fp1 = fopen(reference_file1.c_str(), "rb");
     if (fread(reference_data1.data(), sizeof(float), output_size1, fp1) == 0)
     {
@@ -242,4 +242,3 @@ int main(int argc, char* argv[])
     release_tengine();
     return ret;
 }
-
diff --git a/tests/models/test_timvx_model_yolov5s.cpp b/tests/models/test_timvx_model_yolov5s.cpp
index ecd704a67..509148612 100644
--- a/tests/models/test_timvx_model_yolov5s.cpp
+++ b/tests/models/test_timvx_model_yolov5s.cpp
@@ -37,7 +37,6 @@
 #include "tengine/c_api.h"
 #include "tengine_operations.h"
 
-
 struct Object
 {
     cv::Rect_<float> rect;
@@ -135,8 +134,7 @@ static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vecto
     }
 }
 
-
-static void generate_proposals(int stride,  const float* feat, float prob_threshold, std::vector<Object>& objects,
+static void generate_proposals(int stride, const float* feat, float prob_threshold, std::vector<Object>& objects,
                                int letterbox_cols, int letterbox_rows)
 {
     static float anchors[18] = {10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326};
@@ -146,11 +144,11 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
     int feat_h = letterbox_rows / stride;
     int cls_num = 80;
     int anchor_group;
-    if(stride == 8)
+    if (stride == 8)
         anchor_group = 1;
-    if(stride == 16)
+    if (stride == 16)
         anchor_group = 2;
-    if(stride == 32)
+    if (stride == 32)
         anchor_group = 3;
     for (int h = 0; h <= feat_h - 1; h++)
     {
@@ -164,7 +162,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 for (int s = 0; s <= cls_num - 1; s++)
                 {
                     float score = feat[a * feat_w * feat_h * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5) + s + 5];
-                    if(score > class_score)
+                    if (score > class_score)
                     {
                         class_index = s;
                         class_score = score;
@@ -172,7 +170,7 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
                 }
                 //process box score
                 float box_score = feat[a * feat_w * feat_h * (cls_num + 5) + (h * feat_w) * (cls_num + 5) + w * (cls_num + 5) + 4];
-                float final_score = sigmoid(box_score ) * sigmoid(class_score);
+                float final_score = sigmoid(box_score) * sigmoid(class_score);
                 if (final_score >= prob_threshold)
                 {
                     int loc_idx = a * feat_h * feat_w * (cls_num + 5) + h * feat_w * (cls_num + 5) + w * (cls_num + 5);
@@ -208,16 +206,15 @@ static void generate_proposals(int stride,  const float* feat, float prob_thresh
 static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
 {
     static const char* class_names[] = {
-            "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-            "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-            "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-            "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-            "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-            "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-            "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
-            "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
-            "hair drier", "toothbrush"
-    };
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
 
     cv::Mat image = bgr.clone();
 
@@ -256,8 +253,8 @@ static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
 void show_usage()
 {
     fprintf(
-            stderr,
-            "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
+        stderr,
+        "[Usage]:  [-h]\n    [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n");
 }
 
 void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int letterbox_rows, int letterbox_cols, const float* mean,
@@ -275,9 +272,12 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
     resize_cols = int(scale_letterbox * img.cols);
@@ -286,7 +286,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
     cv::resize(img, img, cv::Size(resize_cols, resize_rows));
     img.convertTo(img, CV_32FC3);
     // Generate a gray image for letterbox using opencv
-    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3,cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2]));
+    cv::Mat img_new(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2]));
     int top = (letterbox_rows - resize_rows) / 2;
     int bot = (letterbox_rows - resize_rows + 1) / 2;
     int left = (letterbox_cols - resize_cols) / 2;
@@ -295,7 +295,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
     cv::copyMakeBorder(img, img_new, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
 
     img_new.convertTo(img_new, CV_32FC3);
-    float* img_data   = (float* )img_new.data;
+    float* img_data = (float*)img_new.data;
     std::vector<float> input_temp(3 * letterbox_cols * letterbox_rows);
 
     /* nhwc to nchw */
@@ -305,7 +305,7 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
         {
             for (int c = 0; c < 3; c++)
             {
-                int in_index  = h * letterbox_cols * 3 + w * 3 + c;
+                int in_index = h * letterbox_cols * 3 + w * 3 + c;
                 int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
                 input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
             }
@@ -319,20 +319,15 @@ void get_input_data_focus_uint8(const char* image_file, uint8_t* input_data, int
         {
             for (int c = 0; c < 3; c++)
             {
-                for (int h = 0; h < letterbox_rows/2; h++)
+                for (int h = 0; h < letterbox_rows / 2; h++)
                 {
-                    for (int w = 0; w < letterbox_cols/2; w++)
+                    for (int w = 0; w < letterbox_cols / 2; w++)
                     {
-                        int in_index  = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows +
-                                        h * 2 * letterbox_cols + w * 2;
-                        int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        g * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        c * (letterbox_cols/2) * (letterbox_rows/2) +
-                                        h * (letterbox_cols/2) +
-                                        w;
+                        int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2;
+                        int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w;
 
                         /* quant to uint8 */
-                        int udata = (round)(input_temp[in_index] / input_scale + ( float )zero_point);
+                        int udata = (round)(input_temp[in_index] / input_scale + (float)zero_point);
                         if (udata > 255)
                             udata = 255;
                         else if (udata < 0)
@@ -366,23 +361,23 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'i':
-                image_file = optarg;
-                break;
-            case 'r':
-                repeat_count = atoi(optarg);
-                break;
-            case 't':
-                num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'i':
+            image_file = optarg;
+            break;
+        case 'r':
+            repeat_count = atoi(optarg);
+            break;
+        case 't':
+            num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -498,7 +493,7 @@ int main(int argc, char* argv[])
         max_time = std::max(max_time, cur);
     }
     fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, num_thread,
-            total_time/repeat_count, max_time, min_time);
+            total_time / repeat_count, max_time, min_time);
     fprintf(stderr, "--------------------------------------\n");
 
     /* yolov5 postprocess */
@@ -529,23 +524,23 @@ int main(int argc, char* argv[])
     std::vector<float> p16_data(p16_count);
     std::vector<float> p32_data(p32_count);
 
-    uint8_t* p8_data_u8  = ( uint8_t* )get_tensor_buffer(p8_output);
-    uint8_t* p16_data_u8 = ( uint8_t* )get_tensor_buffer(p16_output);
-    uint8_t* p32_data_u8 = ( uint8_t* )get_tensor_buffer(p32_output);
+    uint8_t* p8_data_u8 = (uint8_t*)get_tensor_buffer(p8_output);
+    uint8_t* p16_data_u8 = (uint8_t*)get_tensor_buffer(p16_output);
+    uint8_t* p32_data_u8 = (uint8_t*)get_tensor_buffer(p32_output);
 
     for (int c = 0; c < p8_count; c++)
     {
-        p8_data[c] = (( float )p8_data_u8[c] - ( float )p8_zero_point) * p8_scale;
+        p8_data[c] = ((float)p8_data_u8[c] - (float)p8_zero_point) * p8_scale;
     }
 
     for (int c = 0; c < p16_count; c++)
     {
-        p16_data[c] = (( float )p16_data_u8[c] - ( float )p16_zero_point) * p16_scale;
+        p16_data[c] = ((float)p16_data_u8[c] - (float)p16_zero_point) * p16_scale;
     }
 
     for (int c = 0; c < p32_count; c++)
     {
-        p32_data[c] = (( float )p32_data_u8[c] - ( float )p32_zero_point) * p32_scale;
+        p32_data[c] = ((float)p32_data_u8[c] - (float)p32_zero_point) * p32_scale;
     }
 
     /* postprocess */
@@ -562,7 +557,7 @@ int main(int argc, char* argv[])
     proposals.insert(proposals.end(), objects32.begin(), objects32.end());
     generate_proposals(16, p16_data.data(), prob_threshold, objects16, letterbox_cols, letterbox_rows);
     proposals.insert(proposals.end(), objects16.begin(), objects16.end());
-    generate_proposals( 8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows);
+    generate_proposals(8, p8_data.data(), prob_threshold, objects8, letterbox_cols, letterbox_rows);
     proposals.insert(proposals.end(), objects8.begin(), objects8.end());
 
     qsort_descent_inplace(proposals);
@@ -574,9 +569,12 @@ int main(int argc, char* argv[])
     float scale_letterbox;
     int resize_rows;
     int resize_cols;
-    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols)) {
+    if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+    {
         scale_letterbox = letterbox_rows * 1.0 / img.rows;
-    } else {
+    }
+    else
+    {
         scale_letterbox = letterbox_cols * 1.0 / img.cols;
     }
     resize_cols = int(scale_letterbox * img.cols);
@@ -589,7 +587,7 @@ int main(int argc, char* argv[])
     float ratio_y = (float)img.cols / resize_cols;
 
     int count = picked.size();
-    fprintf(stderr, "detection num: %d\n",count);
+    fprintf(stderr, "detection num: %d\n", count);
 
     objects.resize(count);
     for (int i = 0; i < count; i++)
@@ -623,4 +621,3 @@ int main(int argc, char* argv[])
     destroy_graph(graph);
     release_tengine();
 }
-
diff --git a/tests/op/test_onnx_op.h b/tests/op/test_onnx_op.h
index 621baa85d..df9477dd7 100644
--- a/tests/op/test_onnx_op.h
+++ b/tests/op/test_onnx_op.h
@@ -39,7 +39,6 @@
 
 #include "onnx.pb.h"
 
-
 int get_pb_data(float* float_data, const std::string& filepath)
 {
     std::ifstream fs(filepath.c_str(), std::ifstream::in | std::ifstream::binary);
@@ -155,16 +154,16 @@ int get_pb_data_i32(int32_t* i32_data, const std::string& filepath)
 
 int float_mismatch(float* current, float* reference, int size)
 {
-    for(int i=0;i<size;i++)
+    for (int i = 0; i < size; i++)
     {
         float tmp = fabs(current[i]) - fabs(reference[i]);
-        if(fabs(tmp) > 0.0001)
+        if (fabs(tmp) > 0.0001)
         {
             fprintf(stderr, "test failed, index:%d, a:%f, b:%f\n", i, current[i], reference[i]);
             return -1;
         }
     }
-    fprintf(stderr,"test pass\n");
+    fprintf(stderr, "test pass\n");
 
     return 0;
 }
diff --git a/tests/op/test_onnx_op_abs.cpp b/tests/op/test_onnx_op_abs.cpp
index 387b2c3c2..de0a67a47 100644
--- a/tests/op/test_onnx_op_abs.cpp
+++ b/tests/op/test_onnx_op_abs.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_abs";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_abs";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_acos.cpp b/tests/op/test_onnx_op_acos.cpp
index 6a713fadd..26a30dc65 100644
--- a/tests/op/test_onnx_op_acos.cpp
+++ b/tests/op/test_onnx_op_acos.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_acos";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_acos";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_add.cpp b/tests/op/test_onnx_op_add.cpp
index ea634972f..a99c3b4f8 100644
--- a/tests/op/test_onnx_op_add.cpp
+++ b/tests/op/test_onnx_op_add.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_add";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_add";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_asin.cpp b/tests/op/test_onnx_op_asin.cpp
index babcb91ae..42b458d2e 100644
--- a/tests/op/test_onnx_op_asin.cpp
+++ b/tests/op/test_onnx_op_asin.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_asin";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_asin";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_atan.cpp b/tests/op/test_onnx_op_atan.cpp
index 6e35a9238..fd68814a9 100644
--- a/tests/op/test_onnx_op_atan.cpp
+++ b/tests/op/test_onnx_op_atan.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_atan";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_atan";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_averagepool_2d_default.cpp b/tests/op/test_onnx_op_averagepool_2d_default.cpp
index b6a096435..9a9e2fbbc 100644
--- a/tests/op/test_onnx_op_averagepool_2d_default.cpp
+++ b/tests/op/test_onnx_op_averagepool_2d_default.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_averagepool_2d_default";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_averagepool_2d_default";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_averagepool_2d_pads.cpp b/tests/op/test_onnx_op_averagepool_2d_pads.cpp
index bb3087e0b..de2f7ccee 100644
--- a/tests/op/test_onnx_op_averagepool_2d_pads.cpp
+++ b/tests/op/test_onnx_op_averagepool_2d_pads.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_averagepool_2d_pads";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_averagepool_2d_pads";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_basic_conv_with_padding.cpp b/tests/op/test_onnx_op_basic_conv_with_padding.cpp
index d93f99ee3..77242d234 100644
--- a/tests/op/test_onnx_op_basic_conv_with_padding.cpp
+++ b/tests/op/test_onnx_op_basic_conv_with_padding.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_basic_conv_with_padding";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_basic_conv_with_padding";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_basic_conv_without_padding.cpp b/tests/op/test_onnx_op_basic_conv_without_padding.cpp
index e1c4a0854..1aad544f2 100644
--- a/tests/op/test_onnx_op_basic_conv_without_padding.cpp
+++ b/tests/op/test_onnx_op_basic_conv_without_padding.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_basic_conv_without_padding";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_basic_conv_without_padding";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_ceil.cpp b/tests/op/test_onnx_op_ceil.cpp
index 853470b43..16112aa41 100644
--- a/tests/op/test_onnx_op_ceil.cpp
+++ b/tests/op/test_onnx_op_ceil.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_ceil";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_ceil";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_clip_example.cpp b/tests/op/test_onnx_op_clip_example.cpp
index 587ab840f..2d61de319 100644
--- a/tests/op/test_onnx_op_clip_example.cpp
+++ b/tests/op/test_onnx_op_clip_example.cpp
@@ -22,15 +22,14 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node        = "test_clip_example";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
-std::string output_pb   = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model       = "../onnx_node/" + node + "/onnx.tmfile";
+std::string node = "test_clip_example";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -107,7 +106,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* input 2 */
     int input_size_2 = w_2;
@@ -154,7 +153,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_concat_1d_axis_0.cpp b/tests/op/test_onnx_op_concat_1d_axis_0.cpp
index 9919d7d3d..71e6527ac 100644
--- a/tests/op/test_onnx_op_concat_1d_axis_0.cpp
+++ b/tests/op/test_onnx_op_concat_1d_axis_0.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_concat_1d_axis_0";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_concat_1d_axis_0";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -87,7 +86,7 @@ int main(int argc, char* argv[])
     }
 
     /* input 1 */
-    int input_size_1 =  h_1 * w_1;
+    int input_size_1 = h_1 * w_1;
     int dims_1[] = {h_1, w_1};
     std::vector<float> feature_in_1(input_size_1);
 
@@ -108,7 +107,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -130,7 +129,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_concat_2d_axis_0.cpp b/tests/op/test_onnx_op_concat_2d_axis_0.cpp
index 2d41bf5b8..db7586424 100644
--- a/tests/op/test_onnx_op_concat_2d_axis_0.cpp
+++ b/tests/op/test_onnx_op_concat_2d_axis_0.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_concat_2d_axis_0";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_concat_2d_axis_0";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -87,7 +86,7 @@ int main(int argc, char* argv[])
     }
 
     /* input 1 */
-    int input_size_1 =  h_1 * w_1;
+    int input_size_1 = h_1 * w_1;
     int dims_1[] = {h_1, w_1};
     std::vector<float> feature_in_1(input_size_1);
 
@@ -108,7 +107,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -130,7 +129,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_concat_2d_axis_1.cpp b/tests/op/test_onnx_op_concat_2d_axis_1.cpp
index 5afeaedc2..81fde2127 100644
--- a/tests/op/test_onnx_op_concat_2d_axis_1.cpp
+++ b/tests/op/test_onnx_op_concat_2d_axis_1.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_concat_2d_axis_1";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_concat_2d_axis_1";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -87,7 +86,7 @@ int main(int argc, char* argv[])
     }
 
     /* input 1 */
-    int input_size_1 =  h_1 * w_1;
+    int input_size_1 = h_1 * w_1;
     int dims_1[] = {h_1, w_1};
     std::vector<float> feature_in_1(input_size_1);
 
@@ -108,7 +107,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -130,7 +129,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_concat_3d_axis_0.cpp b/tests/op/test_onnx_op_concat_3d_axis_0.cpp
index 849d01b8f..ac4ea7f27 100644
--- a/tests/op/test_onnx_op_concat_3d_axis_0.cpp
+++ b/tests/op/test_onnx_op_concat_3d_axis_0.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_concat_3d_axis_0";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_concat_3d_axis_0";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -110,7 +109,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -132,7 +131,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_concat_3d_axis_1.cpp b/tests/op/test_onnx_op_concat_3d_axis_1.cpp
index cc3abc1e7..c1a58baa5 100644
--- a/tests/op/test_onnx_op_concat_3d_axis_1.cpp
+++ b/tests/op/test_onnx_op_concat_3d_axis_1.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_concat_3d_axis_1";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_concat_3d_axis_1";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -110,7 +109,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -132,7 +131,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_concat_3d_axis_2.cpp b/tests/op/test_onnx_op_concat_3d_axis_2.cpp
index 7a1167db3..5cf3306cc 100644
--- a/tests/op/test_onnx_op_concat_3d_axis_2.cpp
+++ b/tests/op/test_onnx_op_concat_3d_axis_2.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_concat_3d_axis_2";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_concat_3d_axis_2";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -110,7 +109,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -132,7 +131,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp b/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp
index b48855cef..5adb63080 100644
--- a/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp
+++ b/tests/op/test_onnx_op_conv_with_strides_no_padding.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_conv_with_strides_no_padding";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_conv_with_strides_no_padding";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_conv_with_strides_padding.cpp b/tests/op/test_onnx_op_conv_with_strides_padding.cpp
index f1f3407a7..c22adaf04 100644
--- a/tests/op/test_onnx_op_conv_with_strides_padding.cpp
+++ b/tests/op/test_onnx_op_conv_with_strides_padding.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_conv_with_strides_padding";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_conv_with_strides_padding";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_convtranspose.cpp b/tests/op/test_onnx_op_convtranspose.cpp
index 0db15b36d..71fcbf801 100644
--- a/tests/op/test_onnx_op_convtranspose.cpp
+++ b/tests/op/test_onnx_op_convtranspose.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_convtranspose";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_convtranspose";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_convtranspose_dilations.cpp b/tests/op/test_onnx_op_convtranspose_dilations.cpp
index 51cdad8a8..6c7a7b6b8 100644
--- a/tests/op/test_onnx_op_convtranspose_dilations.cpp
+++ b/tests/op/test_onnx_op_convtranspose_dilations.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_convtranspose_dilations";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_convtranspose_dilations";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_convtranspose_pad.cpp b/tests/op/test_onnx_op_convtranspose_pad.cpp
index 510c4d271..83e439c76 100644
--- a/tests/op/test_onnx_op_convtranspose_pad.cpp
+++ b/tests/op/test_onnx_op_convtranspose_pad.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_convtranspose_pad";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_convtranspose_pad";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_convtranspose_pads.cpp b/tests/op/test_onnx_op_convtranspose_pads.cpp
index ff97a6871..ed16a8a77 100644
--- a/tests/op/test_onnx_op_convtranspose_pads.cpp
+++ b/tests/op/test_onnx_op_convtranspose_pads.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_convtranspose_pads";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_convtranspose_pads";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_cos.cpp b/tests/op/test_onnx_op_cos.cpp
index d5ba50fd7..b8b8c20a5 100644
--- a/tests/op/test_onnx_op_cos.cpp
+++ b/tests/op/test_onnx_op_cos.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_cos";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_cos";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp b/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp
index 4d7dd503f..d33409af4 100644
--- a/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp
+++ b/tests/op/test_onnx_op_depthtospace_dcr_mode.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_depthtospace_dcr_mode";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_depthtospace_dcr_mode";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_div.cpp b/tests/op/test_onnx_op_div.cpp
index 50b13cd5e..240e73316 100644
--- a/tests/op/test_onnx_op_div.cpp
+++ b/tests/op/test_onnx_op_div.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_div";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_div";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_dropout_default.cpp b/tests/op/test_onnx_op_dropout_default.cpp
index 139c59fa5..266220f9a 100644
--- a/tests/op/test_onnx_op_dropout_default.cpp
+++ b/tests/op/test_onnx_op_dropout_default.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_dropout_default";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_dropout_default";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -78,7 +77,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -99,7 +98,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_elu.cpp b/tests/op/test_onnx_op_elu.cpp
index 9e868e435..3692035e6 100644
--- a/tests/op/test_onnx_op_elu.cpp
+++ b/tests/op/test_onnx_op_elu.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_elu";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_elu";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_equal.cpp b/tests/op/test_onnx_op_equal.cpp
index 5af989aad..02ee50e8f 100644
--- a/tests/op/test_onnx_op_equal.cpp
+++ b/tests/op/test_onnx_op_equal.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_equal";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_equal";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_exp.cpp b/tests/op/test_onnx_op_exp.cpp
index cab9a7d46..132bbf9be 100644
--- a/tests/op/test_onnx_op_exp.cpp
+++ b/tests/op/test_onnx_op_exp.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_exp";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_exp";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_expand_dim_unchanged.cpp b/tests/op/test_onnx_op_expand_dim_unchanged.cpp
index 964a0faa7..961254b78 100644
--- a/tests/op/test_onnx_op_expand_dim_unchanged.cpp
+++ b/tests/op/test_onnx_op_expand_dim_unchanged.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_expand_dim_unchanged";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_expand_dim_unchanged";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -86,7 +85,7 @@ int main(int argc, char* argv[])
     }
 
     /* input 1 */
-    int input_size_1 =  w_1;
+    int input_size_1 = w_1;
     int dims_1[] = {w_1};
     std::vector<int32_t> feature_in_1(input_size_1);
 
@@ -121,7 +120,6 @@ int main(int argc, char* argv[])
     /* prepare process input data, set the data mem to input tensor */
     get_pb_data(feature_in_0.data(), input_pb_0);
 
-
     /* run graph */
     if (run_graph(graph, 1) < 0)
     {
@@ -131,7 +129,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_floor.cpp b/tests/op/test_onnx_op_floor.cpp
index b520e2385..8957d081d 100644
--- a/tests/op/test_onnx_op_floor.cpp
+++ b/tests/op/test_onnx_op_floor.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_floor";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_floor";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_globalaveragepool.cpp b/tests/op/test_onnx_op_globalaveragepool.cpp
index b013fd306..9a5724dae 100644
--- a/tests/op/test_onnx_op_globalaveragepool.cpp
+++ b/tests/op/test_onnx_op_globalaveragepool.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_globalaveragepool";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_globalaveragepool";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_greater.cpp b/tests/op/test_onnx_op_greater.cpp
index 5eb64587d..d4323308e 100644
--- a/tests/op/test_onnx_op_greater.cpp
+++ b/tests/op/test_onnx_op_greater.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_greater";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_greater";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_gru_defaults.cpp b/tests/op/test_onnx_op_gru_defaults.cpp
index 8dc4620db..9e8c9ec4a 100644
--- a/tests/op/test_onnx_op_gru_defaults.cpp
+++ b/tests/op/test_onnx_op_gru_defaults.cpp
@@ -22,15 +22,14 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node        = "test_gru_defaults";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
-std::string output_pb   = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model       = "../onnx_node/" + node + "/onnx.tmfile";
+std::string node = "test_gru_defaults";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -118,7 +117,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* input 2 */
     int input_size_2 = c_2 * h_2 * w_2;
@@ -165,7 +164,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_gru_seq_length.cpp b/tests/op/test_onnx_op_gru_seq_length.cpp
index a47569ffc..938611efc 100644
--- a/tests/op/test_onnx_op_gru_seq_length.cpp
+++ b/tests/op/test_onnx_op_gru_seq_length.cpp
@@ -22,16 +22,15 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node        = "test_gru_seq_length";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
-std::string input_pb_3  = "../onnx_node/" + node + "/test_data_set_0/input_3.pb";
-std::string output_pb   = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model       = "../onnx_node/" + node + "/onnx.tmfile";
+std::string node = "test_gru_seq_length";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -53,7 +52,7 @@ int main(int argc, char* argv[])
     int n_3 = 1;
     int c_3 = 1;
     int h_3 = 1;
-    int w_3 = 30;    
+    int w_3 = 30;
 
     /* set runtime options */
     struct options opt;
@@ -124,7 +123,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* input 2 */
     int input_size_2 = c_2 * h_2 * w_2;
@@ -148,7 +147,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }      
+    }
 
     /* input 3 */
     int input_size_3 = h_3 * w_3;
@@ -196,7 +195,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_gru_with_initial_bias.cpp b/tests/op/test_onnx_op_gru_with_initial_bias.cpp
index 3e93c3601..e516c0259 100644
--- a/tests/op/test_onnx_op_gru_with_initial_bias.cpp
+++ b/tests/op/test_onnx_op_gru_with_initial_bias.cpp
@@ -22,16 +22,15 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node        = "test_gru_with_initial_bias";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
-std::string input_pb_3  = "../onnx_node/" + node + "/test_data_set_0/input_3.pb";
-std::string output_pb   = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model       = "../onnx_node/" + node + "/onnx.tmfile";
+std::string node = "test_gru_with_initial_bias";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -53,7 +52,7 @@ int main(int argc, char* argv[])
     int n_3 = 1;
     int c_3 = 1;
     int h_3 = 1;
-    int w_3 = 18;    
+    int w_3 = 18;
 
     /* set runtime options */
     struct options opt;
@@ -124,7 +123,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* input 2 */
     int input_size_2 = c_2 * h_2 * w_2;
@@ -148,7 +147,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }      
+    }
 
     /* input 3 */
     int input_size_3 = h_3 * w_3;
@@ -196,7 +195,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_hardsigmoid.cpp b/tests/op/test_onnx_op_hardsigmoid.cpp
index 9a55568e0..8497ed69b 100644
--- a/tests/op/test_onnx_op_hardsigmoid.cpp
+++ b/tests/op/test_onnx_op_hardsigmoid.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_hardsigmoid";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_hardsigmoid";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_instancenorm_epsilon.cpp b/tests/op/test_onnx_op_instancenorm_epsilon.cpp
index 35d599e63..fb59641ff 100644
--- a/tests/op/test_onnx_op_instancenorm_epsilon.cpp
+++ b/tests/op/test_onnx_op_instancenorm_epsilon.cpp
@@ -22,15 +22,14 @@
  * Author: sqfu@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_instancenorm_epsilon";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string node = "test_instancenorm_epsilon";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -70,11 +69,11 @@ int main(int argc, char* argv[])
         fprintf(stderr, "Create graph failed.\n");
         return -1;
     }
-//    set_log_level(LOG_INFO);
-//    dump_graph(graph);
+    //    set_log_level(LOG_INFO);
+    //    dump_graph(graph);
 
     /* set the shape, data buffer of input_tensor of the graph */
-        /* input 0 */
+    /* input 0 */
     int input_size_0 = n_0 * c_0 * h_0 * w_0;
     int dims[] = {n_0, c_0, h_0, w_0};
     std::vector<float> feature_in_0(input_size_0);
@@ -97,7 +96,7 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-        /* input 1 */
+    /* input 1 */
     int input_size_1 = n_1 * c_1 * h_1 * w_1;
     int dims_1[] = {n_1, c_1, h_1, w_1};
     std::vector<float> feature_in_1(input_size_1);
@@ -118,9 +117,9 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor_1 buffer failed\n");
         return -1;
-    } 
+    }
 
-        /* input 2 */
+    /* input 2 */
     int input_size_2 = n_2 * c_2 * h_2 * w_2;
     int dims_2[] = {n_2, c_2, h_2, w_2};
     std::vector<float> feature_in_2(input_size_2);
@@ -141,7 +140,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor_2 buffer failed\n");
         return -1;
-    } 
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -164,7 +163,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_instancenorm_example.cpp b/tests/op/test_onnx_op_instancenorm_example.cpp
index 3331cc5f6..cb19cb2cf 100644
--- a/tests/op/test_onnx_op_instancenorm_example.cpp
+++ b/tests/op/test_onnx_op_instancenorm_example.cpp
@@ -22,15 +22,14 @@
  * Author: sqfu@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_instancenorm_example";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string node = "test_instancenorm_example";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -70,11 +69,11 @@ int main(int argc, char* argv[])
         fprintf(stderr, "Create graph failed.\n");
         return -1;
     }
-//    set_log_level(LOG_INFO);
-//    dump_graph(graph);
+    //    set_log_level(LOG_INFO);
+    //    dump_graph(graph);
 
     /* set the shape, data buffer of input_tensor of the graph */
-        /* input 0 */
+    /* input 0 */
     int input_size_0 = n_0 * c_0 * h_0 * w_0;
     int dims[] = {n_0, c_0, h_0, w_0};
     std::vector<float> feature_in_0(input_size_0);
@@ -97,7 +96,7 @@ int main(int argc, char* argv[])
         return -1;
     }
 
-        /* input 1 */
+    /* input 1 */
     int input_size_1 = n_1 * c_1 * h_1 * w_1;
     int dims_1[] = {n_1, c_1, h_1, w_1};
     std::vector<float> feature_in_1(input_size_1);
@@ -118,9 +117,9 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor_1 buffer failed\n");
         return -1;
-    } 
+    }
 
-        /* input 2 */
+    /* input 2 */
     int input_size_2 = n_2 * c_2 * h_2 * w_2;
     int dims_2[] = {n_2, c_2, h_2, w_2};
     std::vector<float> feature_in_2(input_size_2);
@@ -141,7 +140,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor_2 buffer failed\n");
         return -1;
-    } 
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -164,7 +163,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_leakyrelu.cpp b/tests/op/test_onnx_op_leakyrelu.cpp
index 89ee9ab11..1bbbb3976 100644
--- a/tests/op/test_onnx_op_leakyrelu.cpp
+++ b/tests/op/test_onnx_op_leakyrelu.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_leakyrelu";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_leakyrelu";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_less.cpp b/tests/op/test_onnx_op_less.cpp
index ad286141b..4976bdb66 100644
--- a/tests/op/test_onnx_op_less.cpp
+++ b/tests/op/test_onnx_op_less.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_less";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_less";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_log.cpp b/tests/op/test_onnx_op_log.cpp
index 9e9792a6a..b88123f1f 100644
--- a/tests/op/test_onnx_op_log.cpp
+++ b/tests/op/test_onnx_op_log.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_log";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_log";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_logsoftmax_default_axis.cpp b/tests/op/test_onnx_op_logsoftmax_default_axis.cpp
index b5939e428..aa2293a01 100644
--- a/tests/op/test_onnx_op_logsoftmax_default_axis.cpp
+++ b/tests/op/test_onnx_op_logsoftmax_default_axis.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_logsoftmax_default_axis";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_logsoftmax_default_axis";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_lstm_defaults.cpp b/tests/op/test_onnx_op_lstm_defaults.cpp
index 48d307e46..d5de60070 100644
--- a/tests/op/test_onnx_op_lstm_defaults.cpp
+++ b/tests/op/test_onnx_op_lstm_defaults.cpp
@@ -22,15 +22,14 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node        = "test_lstm_defaults";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
-std::string output_pb   = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model       = "../onnx_node/" + node + "/onnx.tmfile";
+std::string node = "test_lstm_defaults";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -118,7 +117,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* input 2 */
     int input_size_2 = c_2 * h_2 * w_2;
@@ -165,7 +164,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_lstm_with_initial_bias.cpp b/tests/op/test_onnx_op_lstm_with_initial_bias.cpp
index d76dc4b41..d5f61ad52 100644
--- a/tests/op/test_onnx_op_lstm_with_initial_bias.cpp
+++ b/tests/op/test_onnx_op_lstm_with_initial_bias.cpp
@@ -22,16 +22,15 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node        = "test_lstm_with_initial_bias";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
-std::string input_pb_2  = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
-std::string input_pb_3  = "../onnx_node/" + node + "/test_data_set_0/input_3.pb";
-std::string output_pb   = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model       = "../onnx_node/" + node + "/onnx.tmfile";
+std::string node = "test_lstm_with_initial_bias";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string input_pb_2 = "../onnx_node/" + node + "/test_data_set_0/input_2.pb";
+std::string input_pb_3 = "../onnx_node/" + node + "/test_data_set_0/input_3.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -53,7 +52,7 @@ int main(int argc, char* argv[])
     int n_3 = 1;
     int c_3 = 1;
     int h_3 = 1;
-    int w_3 = 32;    
+    int w_3 = 32;
 
     /* set runtime options */
     struct options opt;
@@ -124,7 +123,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* input 2 */
     int input_size_2 = c_2 * h_2 * w_2;
@@ -148,7 +147,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }      
+    }
 
     /* input 3 */
     int input_size_3 = h_3 * w_3;
@@ -196,7 +195,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_matmul_2d.cpp b/tests/op/test_onnx_op_matmul_2d.cpp
index 866b9cd24..ef7f75c5b 100644
--- a/tests/op/test_onnx_op_matmul_2d.cpp
+++ b/tests/op/test_onnx_op_matmul_2d.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_matmul_2d";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_matmul_2d";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -108,7 +107,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -130,7 +129,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_matmul_3d.cpp b/tests/op/test_onnx_op_matmul_3d.cpp
index 28ed0612b..67e16bec1 100644
--- a/tests/op/test_onnx_op_matmul_3d.cpp
+++ b/tests/op/test_onnx_op_matmul_3d.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_matmul_3d";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_matmul_3d";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -110,7 +109,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -132,7 +131,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_matmul_4d.cpp b/tests/op/test_onnx_op_matmul_4d.cpp
index dcdc86d5c..02308791f 100644
--- a/tests/op/test_onnx_op_matmul_4d.cpp
+++ b/tests/op/test_onnx_op_matmul_4d.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_matmul_4d";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_matmul_4d";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_maxpool_2d_default.cpp b/tests/op/test_onnx_op_maxpool_2d_default.cpp
index a6316ffa8..6f5ef2fce 100644
--- a/tests/op/test_onnx_op_maxpool_2d_default.cpp
+++ b/tests/op/test_onnx_op_maxpool_2d_default.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_maxpool_2d_default";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_maxpool_2d_default";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_maxpool_2d_dilations.cpp b/tests/op/test_onnx_op_maxpool_2d_dilations.cpp
index d063e4851..475499f69 100644
--- a/tests/op/test_onnx_op_maxpool_2d_dilations.cpp
+++ b/tests/op/test_onnx_op_maxpool_2d_dilations.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_maxpool_2d_dilations";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_maxpool_2d_dilations";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_maxpool_2d_pads.cpp b/tests/op/test_onnx_op_maxpool_2d_pads.cpp
index 02f757e66..7de587e1a 100644
--- a/tests/op/test_onnx_op_maxpool_2d_pads.cpp
+++ b/tests/op/test_onnx_op_maxpool_2d_pads.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_maxpool_2d_pads";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_maxpool_2d_pads";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_neg.cpp b/tests/op/test_onnx_op_neg.cpp
index 1805caf47..b9c54fbe1 100644
--- a/tests/op/test_onnx_op_neg.cpp
+++ b/tests/op/test_onnx_op_neg.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_neg";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_neg";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_pow.cpp b/tests/op/test_onnx_op_pow.cpp
index b8a4dcfbd..c8a129d42 100644
--- a/tests/op/test_onnx_op_pow.cpp
+++ b/tests/op/test_onnx_op_pow.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_pow";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_pow";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_reciprocal.cpp b/tests/op/test_onnx_op_reciprocal.cpp
index 328e84ae9..aa8f1703d 100644
--- a/tests/op/test_onnx_op_reciprocal.cpp
+++ b/tests/op/test_onnx_op_reciprocal.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_reciprocal";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_reciprocal";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_reduce_log_sum_default.cpp b/tests/op/test_onnx_op_reduce_log_sum_default.cpp
index 7871749a2..967d53786 100644
--- a/tests/op/test_onnx_op_reduce_log_sum_default.cpp
+++ b/tests/op/test_onnx_op_reduce_log_sum_default.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_reduce_log_sum_default";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_reduce_log_sum_default";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp b/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp
index 078f728d4..22114ea6b 100644
--- a/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp
+++ b/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_reduce_max_default_axes_keepdim_example";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_reduce_max_default_axes_keepdim_example";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp b/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp
index 7ee3e5e48..ef0a74ad2 100644
--- a/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp
+++ b/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_reduce_mean_default_axes_keepdims_example";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_reduce_mean_default_axes_keepdims_example";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp b/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp
index b3522ce9c..6eea307cf 100644
--- a/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp
+++ b/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_reduce_min_default_axes_keepdims_example";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_reduce_min_default_axes_keepdims_example";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp b/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp
index 724c300fa..e07d3fb26 100644
--- a/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp
+++ b/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_reduce_sum_square_default_axes_keepdims_example";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_reduce_sum_square_default_axes_keepdims_example";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_relu.cpp b/tests/op/test_onnx_op_relu.cpp
index ac53247da..f8f6b70f9 100644
--- a/tests/op/test_onnx_op_relu.cpp
+++ b/tests/op/test_onnx_op_relu.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_relu";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_relu";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_selu.cpp b/tests/op/test_onnx_op_selu.cpp
index af057807f..35510186b 100644
--- a/tests/op/test_onnx_op_selu.cpp
+++ b/tests/op/test_onnx_op_selu.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_selu";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_selu";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_selu_default.cpp b/tests/op/test_onnx_op_selu_default.cpp
index 6225d028c..510c6fc6a 100644
--- a/tests/op/test_onnx_op_selu_default.cpp
+++ b/tests/op/test_onnx_op_selu_default.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_selu_default";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_selu_default";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_softmax_default_axis.cpp b/tests/op/test_onnx_op_softmax_default_axis.cpp
index 57581c1d9..81aff8276 100644
--- a/tests/op/test_onnx_op_softmax_default_axis.cpp
+++ b/tests/op/test_onnx_op_softmax_default_axis.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_softmax_default_axis";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_softmax_default_axis";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_softplus.cpp b/tests/op/test_onnx_op_softplus.cpp
index c6b3c1cc8..3a6a9bb15 100644
--- a/tests/op/test_onnx_op_softplus.cpp
+++ b/tests/op/test_onnx_op_softplus.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_softplus";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_softplus";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_squeeze.cpp b/tests/op/test_onnx_op_squeeze.cpp
index 2fdccb469..d5406e476 100644
--- a/tests/op/test_onnx_op_squeeze.cpp
+++ b/tests/op/test_onnx_op_squeeze.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_squeeze";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_squeeze";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_sub.cpp b/tests/op/test_onnx_op_sub.cpp
index 9a2f75db0..137afed64 100644
--- a/tests/op/test_onnx_op_sub.cpp
+++ b/tests/op/test_onnx_op_sub.cpp
@@ -22,14 +22,13 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_sub";
-std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
-std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string node = "test_sub";
+std::string input_pb_0 = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1 = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -112,7 +111,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -134,7 +133,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_tanh.cpp b/tests/op/test_onnx_op_tanh.cpp
index 66c974957..78b2d1628 100644
--- a/tests/op/test_onnx_op_tanh.cpp
+++ b/tests/op/test_onnx_op_tanh.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_tanh";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_tanh";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_onnx_op_unsqueeze_axis_1.cpp b/tests/op/test_onnx_op_unsqueeze_axis_1.cpp
index 2bf5406e7..99105f29e 100644
--- a/tests/op/test_onnx_op_unsqueeze_axis_1.cpp
+++ b/tests/op/test_onnx_op_unsqueeze_axis_1.cpp
@@ -22,13 +22,12 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_onnx_op.h"
 
-std::string node      = "test_unsqueeze_axis_1";
-std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string node = "test_unsqueeze_axis_1";
+std::string input_pb = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
 std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
-std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+std::string model = "../onnx_node/" + node + "/onnx.tmfile";
 
 int main(int argc, char* argv[])
 {
@@ -81,7 +80,7 @@ int main(int argc, char* argv[])
     {
         fprintf(stderr, "Set input tensor buffer failed\n");
         return -1;
-    }    
+    }
 
     /* prerun graph, set work options(num_thread, cluster, precision) */
     if (prerun_graph_multithread(graph, opt) < 0)
@@ -102,7 +101,7 @@ int main(int argc, char* argv[])
 
     /* get the current result of inference */
     tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
-    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    float* output_data = (float*)get_tensor_buffer(output_tensor);
     int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
 
     /* get the reference result of inference */
diff --git a/tests/op/test_op.h b/tests/op/test_op.h
index b08f81e44..dd9c97007 100644
--- a/tests/op/test_op.h
+++ b/tests/op/test_op.h
@@ -18,11 +18,10 @@
 #include "graph/tensor.h"
 
 #define TENSOR_SHOW_LEADING_BLANK "    "
-#define TENSOR_FLOAT_EPSILON 0.0001f
+#define TENSOR_FLOAT_EPSILON      0.0001f
 
 typedef int (*common_test)(graph_t, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w);
 
-
 void dump_tensor_line(void* data_ptr, int offset, int data_type, int w)
 {
     if (0 >= w)
@@ -33,80 +32,79 @@ void dump_tensor_line(void* data_ptr, int offset, int data_type, int w)
 
     printf("[ ");
 
-    switch(data_type)
+    switch (data_type)
+    {
+    case TENGINE_DT_FP32:
     {
-        case TENGINE_DT_FP32:
+        float* p = (float*)data_ptr;
+
+        for (int i = 0; i < w - 1; i++)
         {
-            float* p = ( float* )data_ptr;
+            printf("%0.2f, ", p[offset + i]);
+        }
+        printf("%0.2f ", p[offset + w - 1]);
 
-            for(int i = 0; i < w - 1; i++)
-            {
-                printf("%0.2f, ", p[offset + i]);
-            }
-            printf("%0.2f ", p[offset + w - 1]);
+        break;
+    }
+    case TENGINE_DT_FP16:
+    {
+        __fp16* p = (__fp16*)data_ptr;
 
-            break;
+#ifdef __ARM_ARCH
+        for (int i = 0; i < w - 1; i++)
+        {
+            printf("%f, ", (float)p[offset + i]);
         }
-        case TENGINE_DT_FP16:
+        printf("%f ", (float)p[offset + w - 1]);
+#else
+        for (int i = 0; i < w - 1; i++)
         {
-            __fp16* p = ( __fp16* )data_ptr;
+            printf("%f, ", fp16_to_fp32(p[offset + i]));
+        }
+        printf("%f ", fp16_to_fp32(p[offset + w - 1]));
+#endif
+        break;
+    }
+    case TENGINE_DT_INT8:
+    case TENGINE_DT_UINT8:
+    {
+        if (data_type == TENGINE_DT_INT8)
+        {
+            int8_t* p = (int8_t*)data_ptr;
 
-#ifdef __ARM_ARCH
-            for(int i = 0; i < w - 1; i++)
+            for (int i = 0; i < w - 1; i++)
             {
-                printf("%f, ", (float)p[offset + i]);
+                printf("%d, ", (int)p[offset + i]);
             }
-            printf("%f ", (float)p[offset + w - 1]);
-#else
-            for(int i = 0; i < w - 1; i++)
-            {
-                printf("%f, ", fp16_to_fp32(p[offset + i]));
-            }
-            printf("%f ", fp16_to_fp32(p[offset + w - 1]));
-#endif
-            break;
+            printf("%d ", (int)p[offset + w - 1]);
         }
-        case TENGINE_DT_INT8:
-        case TENGINE_DT_UINT8:
+        else
         {
-            if(data_type == TENGINE_DT_INT8)
-            {
-                int8_t* p = ( int8_t* )data_ptr;
+            uint8_t* p = (uint8_t*)data_ptr;
 
-                for(int i = 0; i < w - 1; i++)
-                {
-                    printf("%d, ", (int)p[offset + i]);
-                }
-                printf("%d ", (int)p[offset + w - 1]);
-            }
-            else
+            for (int i = 0; i < w - 1; i++)
             {
-                uint8_t* p = ( uint8_t* )data_ptr;
-
-                for(int i = 0; i < w - 1; i++)
-                {
-                    printf("%d, ", (int)p[offset + i]);
-                }
-                printf("%d ", (int)p[offset + w - 1]);
+                printf("%d, ", (int)p[offset + i]);
             }
-
-            break;
+            printf("%d ", (int)p[offset + w - 1]);
         }
-        default:
-            // not deal with TENGINE_DT_INT16 and TENGINE_DT_INT32
-            fprintf(stderr, "Unsupported data type for now. ");
+
+        break;
+    }
+    default:
+        // not deal with TENGINE_DT_INT16 and TENGINE_DT_INT32
+        fprintf(stderr, "Unsupported data type for now. ");
     }
 
     printf("]");
 }
 
-
 void dump_tensor(tensor_t tensor, const char* message)
 {
     int data_type = get_tensor_data_type(tensor);
     void* data_ptr = get_tensor_buffer(tensor);
 
-    int dim_array[MAX_SHAPE_DIM_NUM] = { 0 };
+    int dim_array[MAX_SHAPE_DIM_NUM] = {0};
     int dim_count = get_tensor_shape(tensor, dim_array, MAX_SHAPE_DIM_NUM);
     if (0 >= dim_count)
         fprintf(stderr, "Cannot get tensor shape.");
@@ -119,34 +117,34 @@ void dump_tensor(tensor_t tensor, const char* message)
 
     switch (dim_count)
     {
-        case 4:
-        {
-            n = dim_array[0];
-            c = dim_array[1];
-            h = dim_array[2];
-            w = dim_array[3];
-            break;
-        }
-        case 3:
-        {
-            c = dim_array[0];
-            h = dim_array[1];
-            w = dim_array[2];
-            break;
-        }
-        case 2:
-        {
-            h = dim_array[0];
-            w = dim_array[1];
-            break;
-        }
-        case 1:
-        {
-            w = dim_array[0];
-            break;
-        }
-        default:
-            fprintf(stderr, "Cannot found the type of tensor.\n");
+    case 4:
+    {
+        n = dim_array[0];
+        c = dim_array[1];
+        h = dim_array[2];
+        w = dim_array[3];
+        break;
+    }
+    case 3:
+    {
+        c = dim_array[0];
+        h = dim_array[1];
+        w = dim_array[2];
+        break;
+    }
+    case 2:
+    {
+        h = dim_array[0];
+        w = dim_array[1];
+        break;
+    }
+    case 1:
+    {
+        w = dim_array[0];
+        break;
+    }
+    default:
+        fprintf(stderr, "Cannot found the type of tensor.\n");
     }
 
     // print leader
@@ -182,11 +180,10 @@ void dump_tensor(tensor_t tensor, const char* message)
     printf("].\n");
 }
 
-
 void dump_node_input(node_t test_node, int index)
 {
     tensor_t tensor = get_node_input_tensor(test_node, index);
-    if(NULL == tensor)
+    if (NULL == tensor)
     {
         fprintf(stderr, "Get input tensor(%d) from the node failed.\n", index);
         return;
@@ -200,11 +197,10 @@ void dump_node_input(node_t test_node, int index)
     release_graph_tensor(tensor);
 }
 
-
 void dump_node_output(node_t test_node, int index)
 {
     tensor_t tensor = get_node_output_tensor(test_node, index);
-    if(NULL == tensor)
+    if (NULL == tensor)
     {
         fprintf(stderr, "Get output tensor from the node failed.\n");
         return;
@@ -218,7 +214,6 @@ void dump_node_output(node_t test_node, int index)
     release_graph_tensor(tensor);
 }
 
-
 int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w, int data_type, int layout)
 {
     node_t node = create_graph_node(graph, node_name, "InputOp");
@@ -229,7 +224,7 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w
     }
 
     tensor_t tensor = create_graph_tensor(graph, node_name, data_type);
-    if(NULL == tensor)
+    if (NULL == tensor)
     {
         release_graph_node(node);
 
@@ -239,13 +234,13 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w
 
     set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT);
 
-    if(TENGINE_LAYOUT_NCHW == layout)
+    if (TENGINE_LAYOUT_NCHW == layout)
     {
         int dims[4] = {n, c, h, w};
         set_tensor_shape(tensor, dims, 4);
     }
 
-    if(TENGINE_LAYOUT_NHWC == layout)
+    if (TENGINE_LAYOUT_NHWC == layout)
     {
         int dims[4] = {n, h, w, c};
         set_tensor_shape(tensor, dims, 4);
@@ -257,7 +252,6 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w
     return 0;
 }
 
-
 int create_input_node(graph_t graph, const char* node_name, int data_type, int layout, int n, int c, int h, int w, int dims_count = 4)
 {
     if (0 == n) dims_count = 3;
@@ -277,7 +271,7 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l
     }
 
     tensor_t tensor = create_graph_tensor(graph, node_name, data_type);
-    if(NULL == tensor)
+    if (NULL == tensor)
     {
         release_graph_node(node);
 
@@ -287,7 +281,7 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l
     }
 
     int ret = set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT);
-    if(0 != ret)
+    if (0 != ret)
     {
         release_graph_tensor(tensor);
         release_graph_node(node);
@@ -297,70 +291,70 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l
         return -1;
     }
 
-    switch(dims_count)
+    switch (dims_count)
+    {
+    case 1:
+    {
+        int dims_array[1] = {w};
+        set_tensor_shape(tensor, dims_array, dims_count);
+        break;
+    }
+    case 2:
     {
-        case 1:
+        int dims_array[2] = {h, w};
+        set_tensor_shape(tensor, dims_array, dims_count);
+        break;
+    }
+    case 3:
+    {
+        if (TENGINE_LAYOUT_NCHW == layout)
         {
-            int dims_array[1] = { w };
+            int dims_array[3] = {c, h, w};
             set_tensor_shape(tensor, dims_array, dims_count);
             break;
         }
-        case 2:
+
+        if (TENGINE_LAYOUT_NHWC == layout)
         {
-            int dims_array[2] = { h, w };
+            int dims_array[3] = {h, w, c};
             set_tensor_shape(tensor, dims_array, dims_count);
             break;
         }
-        case 3:
+    }
+    case 4:
+    {
+        if (TENGINE_LAYOUT_NCHW == layout)
         {
-            if (TENGINE_LAYOUT_NCHW == layout)
-            {
-                int dims_array[3] = { c, h, w };
-                set_tensor_shape(tensor, dims_array, dims_count);
-                break;
-            }
-
-            if (TENGINE_LAYOUT_NHWC == layout)
-            {
-                int dims_array[3] = { h, w, c };
-                set_tensor_shape(tensor, dims_array, dims_count);
-                break;
-            }
+            int dims_array[4] = {n, c, h, w};
+            set_tensor_shape(tensor, dims_array, dims_count);
+            break;
         }
-        case 4:
-        {
-            if (TENGINE_LAYOUT_NCHW == layout)
-            {
-                int dims_array[4] = { n, c, h, w };
-                set_tensor_shape(tensor, dims_array, dims_count);
-                break;
-            }
 
-            if (TENGINE_LAYOUT_NHWC == layout)
-            {
-                int dims_array[4] = { n, h, w, c };
-                set_tensor_shape(tensor, dims_array, dims_count);
-                break;
-            }
+        if (TENGINE_LAYOUT_NHWC == layout)
+        {
+            int dims_array[4] = {n, h, w, c};
+            set_tensor_shape(tensor, dims_array, dims_count);
+            break;
         }
-        case 5:
+    }
+    case 5:
+    {
+        if (TENGINE_LAYOUT_NCHW == layout)
         {
-            if (TENGINE_LAYOUT_NCHW == layout)
-            {
-                int dims_array[5] = {1, n, c, h, w };
-                set_tensor_shape(tensor, dims_array, dims_count);
-                break;
-            }
+            int dims_array[5] = {1, n, c, h, w};
+            set_tensor_shape(tensor, dims_array, dims_count);
+            break;
+        }
 
-            if (TENGINE_LAYOUT_NHWC == layout)
-            {
-                int dims_array[5] = {1, n, h, w, c };
-                set_tensor_shape(tensor, dims_array, dims_count);
-                break;
-            }
+        if (TENGINE_LAYOUT_NHWC == layout)
+        {
+            int dims_array[5] = {1, n, h, w, c};
+            set_tensor_shape(tensor, dims_array, dims_count);
+            break;
         }
-        default:
-            fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count);
+    }
+    default:
+        fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count);
     }
 
     release_graph_tensor(tensor);
@@ -369,7 +363,6 @@ int create_input_node(graph_t graph, const char* node_name, int data_type, int l
     return 0;
 }
 
-
 int fill_fp32_tensor(tensor_t tensor, float value)
 {
     int dims[MAX_SHAPE_DIM_NUM];
@@ -394,7 +387,6 @@ int fill_fp32_tensor(tensor_t tensor, float value)
     return 0;
 }
 
-
 int fill_uint8_tensor(tensor_t tensor, float value)
 {
     int dims[MAX_SHAPE_DIM_NUM];
@@ -416,7 +408,7 @@ int fill_uint8_tensor(tensor_t tensor, float value)
     int input_zero_point = 0;
     get_tensor_quant_param(tensor, &input_scale, &input_zero_point, 1);
 
-    uint8_t * data_ptr = (uint8_t *)get_tensor_buffer(tensor);
+    uint8_t* data_ptr = (uint8_t*)get_tensor_buffer(tensor);
     for (int i = 0; i < element_count; i++)
     {
         int udata = (round)(value / input_scale + (float)input_zero_point);
@@ -430,118 +422,112 @@ int fill_uint8_tensor(tensor_t tensor, float value)
     return 0;
 }
 
-
 void fill_input_float_tensor_by_index(graph_t graph, int input_node_index, int tensor_index, float value)
 {
     tensor_t tensor = get_graph_input_tensor(graph, input_node_index, tensor_index);
-    if(NULL == tensor)
+    if (NULL == tensor)
         fprintf(stderr, "Cannot find the %dth tensor via node index(%d).\n", tensor_index, input_node_index);
 
     int buf_size = get_tensor_buffer_size(tensor);
-    float* data = (float* )malloc(buf_size);
+    float* data = (float*)malloc(buf_size);
 
-//    for(int i = 0; i < buf_size/sizeof(float); i++)
-//        data[i] = value;
+    //    for(int i = 0; i < buf_size/sizeof(float); i++)
+    //        data[i] = value;
 
-    int ret = set_tensor_buffer(tensor, (void* )data, buf_size);
-    if(0 != ret)
+    int ret = set_tensor_buffer(tensor, (void*)data, buf_size);
+    if (0 != ret)
         fprintf(stderr, "Set buffer for tensor failed.\n");
 
     ret = fill_fp32_tensor(tensor, value);
-    if(0 != ret)
+    if (0 != ret)
         fprintf(stderr, "Fill buffer for tensor failed.\n");
 }
 
-
 void fill_input_uint8_tensor_by_index(graph_t graph, int input_node_index, int tensor_index, float value)
 {
     tensor_t tensor = get_graph_input_tensor(graph, input_node_index, tensor_index);
-    if(NULL == tensor)
+    if (NULL == tensor)
         fprintf(stderr, "Cannot find the %dth tensor via node index(%d).\n", tensor_index, input_node_index);
 
     int buf_size = get_tensor_buffer_size(tensor);
-    uint8_t* data = (uint8_t* )malloc(buf_size);
+    uint8_t* data = (uint8_t*)malloc(buf_size);
 
-    int ret = set_tensor_buffer(tensor, (void* )data, buf_size);
-    if(0 != ret)
+    int ret = set_tensor_buffer(tensor, (void*)data, buf_size);
+    if (0 != ret)
         fprintf(stderr, "Set buffer for tensor failed.\n");
 
     ret = fill_uint8_tensor(tensor, value);
-    if(0 != ret)
+    if (0 != ret)
         fprintf(stderr, "Fill buffer for tensor failed.\n");
 }
 
-
 void fill_input_float_tensor_by_name(graph_t graph, const char* node_name, int tensor_index, float value)
 {
     node_t node = get_graph_node(graph, node_name);
-    if(NULL == node)
+    if (NULL == node)
         fprintf(stderr, "Cannot get node via node name(%s).\n", node_name);
 
     tensor_t tensor = get_node_input_tensor(node, tensor_index);
-    if(NULL == tensor)
+    if (NULL == tensor)
         fprintf(stderr, "Cannot find the %dth tensor via node name(%s)\n", tensor_index, node_name);
 
     int buf_size = get_tensor_buffer_size(tensor);
-    float* data = (float* )malloc(buf_size);
+    float* data = (float*)malloc(buf_size);
 
-//    for(unsigned int i = 0; i < buf_size/sizeof(float) ; i++)
-//        data[i] = value;
+    //    for(unsigned int i = 0; i < buf_size/sizeof(float) ; i++)
+    //        data[i] = value;
 
-    int ret = set_tensor_buffer(tensor, (void* )data, buf_size);
-    if(0 != ret)
+    int ret = set_tensor_buffer(tensor, (void*)data, buf_size);
+    if (0 != ret)
         fprintf(stderr, "Set buffer for tensor failed.\n");
 
     ret = fill_fp32_tensor(tensor, value);
-    if(0 != ret)
+    if (0 != ret)
         fprintf(stderr, "Fill buffer for tensor failed.\n");
 }
 
-
 void fill_input_float_buffer_tensor_by_name(graph_t graph, const char* node_name, int tensor_index, void* value, int buf_size)
 {
     node_t node = get_graph_node(graph, node_name);
-    if(NULL == node)
+    if (NULL == node)
         fprintf(stderr, "Cannot get node via node name(%s).\n", node_name);
 
     tensor_t tensor = get_node_input_tensor(node, tensor_index);
-    if(NULL == tensor)
+    if (NULL == tensor)
         fprintf(stderr, "Cannot find the %dth tensor via node name(%s).\n", tensor_index, node_name);
 
     int ret = set_tensor_buffer(tensor, value, buf_size);
-    if(0 != ret)
+    if (0 != ret)
         fprintf(stderr, "Set buffer for tensor failed.\n");
 }
 
-
 void fill_input_integer_tensor_by_name(graph_t graph, const char* node_name, int tensor_index, int value)
 {
     node_t node = get_graph_node(graph, node_name);
-    if(NULL == node)
+    if (NULL == node)
     {
         fprintf(stderr, "Cannot get node via node name(%s).\n", node_name);
         return;
     }
 
     tensor_t tensor = get_node_input_tensor(node, tensor_index);
-    if(NULL == tensor)
+    if (NULL == tensor)
     {
         fprintf(stderr, "Cannot find the %dth tensor via node name(%s).\n", tensor_index, node_name);
         return;
     }
 
     int buf_size = get_tensor_buffer_size(tensor);
-    int* data = (int* )malloc(buf_size);
+    int* data = (int*)malloc(buf_size);
 
-    for(unsigned int i = 0; i < buf_size/sizeof(int) ; i++)
+    for (unsigned int i = 0; i < buf_size / sizeof(int); i++)
         data[i] = value;
 
-    int ret = set_tensor_buffer(tensor, (void* )data, buf_size);
-    if(0 != ret)
+    int ret = set_tensor_buffer(tensor, (void*)data, buf_size);
+    if (0 != ret)
         fprintf(stderr, "Set buffer for tensor failed.\n");
 }
 
-
 int test_graph_init()
 {
     // now init tengine will mask critical filed and return an error
@@ -551,10 +537,9 @@ int test_graph_init()
     return 0;
 }
 
-
 int test_graph_run(graph_t graph)
 {
-    if(prerun_graph(graph) < 0)
+    if (prerun_graph(graph) < 0)
     {
         fprintf(stderr, "Pre-run graph failed.\n");
         return -1;
@@ -571,7 +556,6 @@ int test_graph_run(graph_t graph)
     return 0;
 }
 
-
 void test_graph_release(graph_t graph)
 {
     postrun_graph(graph);
@@ -579,30 +563,29 @@ void test_graph_release(graph_t graph)
     release_tengine();
 }
 
-
 graph_t create_common_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4)
 {
     graph_t graph = create_graph(NULL, NULL, NULL);
-    if(NULL == graph)
+    if (NULL == graph)
     {
         fprintf(stderr, "get graph failed.\n");
         return NULL;
     }
 
-    if(set_graph_layout(graph, layout) < 0)
+    if (set_graph_layout(graph, layout) < 0)
     {
         fprintf(stderr, "set layout failed.\n");
         return NULL;
     }
 
     const char* input_name = "input_node";
-    if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
+    if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
     {
         fprintf(stderr, "create input node failed.\n");
         return NULL;
     }
 
-    if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0)
+    if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0)
     {
         fprintf(stderr, "create test node failed.\n");
         return NULL;
@@ -612,13 +595,13 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int
     const char* inputs[] = {input_name};
     const char* outputs[] = {test_node_name};
 
-    if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
+    if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set inputs failed.\n");
         return NULL;
     }
 
-    if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
+    if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set outputs failed.\n");
         return NULL;
@@ -627,7 +610,6 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int
     return graph;
 }
 
-
 graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4)
 {
     /* create VeriSilicon TIM-VX backend */
@@ -640,26 +622,26 @@ graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int l
     }
 
     graph_t graph = create_graph(timvx_context, NULL, NULL);
-    if(NULL == graph)
+    if (NULL == graph)
     {
         fprintf(stderr, "get graph failed.\n");
         return NULL;
     }
 
-    if(set_graph_layout(graph, layout) < 0)
+    if (set_graph_layout(graph, layout) < 0)
     {
         fprintf(stderr, "set layout failed.\n");
         return NULL;
     }
 
     const char* input_name = "input_node";
-    if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
+    if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
     {
         fprintf(stderr, "create input node failed.\n");
         return NULL;
     }
 
-    if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0)
+    if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0)
     {
         fprintf(stderr, "create test node failed.\n");
         return NULL;
@@ -669,13 +651,13 @@ graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int l
     const char* inputs[] = {input_name};
     const char* outputs[] = {test_node_name};
 
-    if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
+    if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set inputs failed.\n");
         return NULL;
     }
 
-    if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
+    if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set outputs failed.\n");
         return NULL;
@@ -696,26 +678,26 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in
     }
 
     graph_t graph = create_graph(timvx_context, NULL, NULL);
-    if(NULL == graph)
+    if (NULL == graph)
     {
         fprintf(stderr, "get graph failed.\n");
         return NULL;
     }
 
-    if(set_graph_layout(graph, layout) < 0)
+    if (set_graph_layout(graph, layout) < 0)
     {
         fprintf(stderr, "set layout failed.\n");
         return NULL;
     }
 
     const char* input_name = "input_node";
-    if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
+    if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
     {
         fprintf(stderr, "create input node failed.\n");
         return NULL;
     }
 
-    if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0)
+    if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0)
     {
         fprintf(stderr, "create test node failed.\n");
         return NULL;
@@ -725,13 +707,13 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in
     const char* inputs[] = {input_name};
     const char* outputs[] = {test_node_name};
 
-    if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
+    if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set inputs failed.\n");
         return NULL;
     }
 
-    if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
+    if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set outputs failed.\n");
         return NULL;
@@ -743,26 +725,26 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in
 graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4)
 {
     graph_t graph = create_graph(NULL, NULL, NULL);
-    if(NULL == graph)
+    if (NULL == graph)
     {
         fprintf(stderr, "get graph failed.\n");
         return NULL;
     }
 
-    if(set_graph_layout(graph, layout) < 0)
+    if (set_graph_layout(graph, layout) < 0)
     {
         fprintf(stderr, "set layout failed.\n");
         return NULL;
     }
 
     const char* input_name = "input_node";
-    if(create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
+    if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0)
     {
         fprintf(stderr, "create input node failed.\n");
         return NULL;
     }
 
-    if(test_func(graph, input_name, test_node_name, data_type, layout, n, c, h ,w) < 0)
+    if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0)
     {
         fprintf(stderr, "create test node failed.\n");
         return NULL;
@@ -772,13 +754,13 @@ graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int lay
     const char* inputs[] = {input_name};
     const char* outputs[] = {test_node_name};
 
-    if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
+    if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set inputs failed.\n");
         return NULL;
     }
 
-    if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
+    if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set outputs failed.\n");
         return NULL;
@@ -818,75 +800,74 @@ int compare_tensor(tensor_t a, tensor_t b)
 
     switch (a_type)
     {
-        case TENGINE_DT_FP32:
-        {
-            float* a_data_ptr = (float*)get_tensor_buffer(a);
-            float* b_data_ptr = (float*)get_tensor_buffer(b);
-
-            for (int i = 0; i < element_size; i++)
-                if (fabsf(a_data_ptr[i] - b_data_ptr[i]) < TENSOR_FLOAT_EPSILON)
-                    return -1;
+    case TENGINE_DT_FP32:
+    {
+        float* a_data_ptr = (float*)get_tensor_buffer(a);
+        float* b_data_ptr = (float*)get_tensor_buffer(b);
 
-            break;
-        }
-        case TENGINE_DT_FP16:
-        {
-            __fp16* a_data_ptr = (__fp16*)get_tensor_buffer(a);
-            __fp16* b_data_ptr = (__fp16*)get_tensor_buffer(b);
+        for (int i = 0; i < element_size; i++)
+            if (fabsf(a_data_ptr[i] - b_data_ptr[i]) < TENSOR_FLOAT_EPSILON)
+                return -1;
 
-            for (int i = 0; i < element_size; i++)
-            {
-                if (fabsf((float)fp16_to_fp32(a_data_ptr[i]) - (float)fp16_to_fp32(b_data_ptr[i])) < TENSOR_FLOAT_EPSILON)
-                    return -1;
-            }
+        break;
+    }
+    case TENGINE_DT_FP16:
+    {
+        __fp16* a_data_ptr = (__fp16*)get_tensor_buffer(a);
+        __fp16* b_data_ptr = (__fp16*)get_tensor_buffer(b);
 
-            break;
-        }
-        case TENGINE_DT_INT32:
+        for (int i = 0; i < element_size; i++)
         {
-            int32_t* a_data_ptr = (int32_t*)get_tensor_buffer(a);
-            int32_t* b_data_ptr = (int32_t*)get_tensor_buffer(b);
+            if (fabsf((float)fp16_to_fp32(a_data_ptr[i]) - (float)fp16_to_fp32(b_data_ptr[i])) < TENSOR_FLOAT_EPSILON)
+                return -1;
+        }
 
-            for (int i = 0; i < element_size; i++)
-                if (a_data_ptr[i] != b_data_ptr[i])
-                    return -1;
+        break;
+    }
+    case TENGINE_DT_INT32:
+    {
+        int32_t* a_data_ptr = (int32_t*)get_tensor_buffer(a);
+        int32_t* b_data_ptr = (int32_t*)get_tensor_buffer(b);
 
-            break;
-        }
-        case TENGINE_DT_INT16:
-        {
-            int16_t* a_data_ptr = (int16_t*)get_tensor_buffer(a);
-            int16_t* b_data_ptr = (int16_t*)get_tensor_buffer(b);
+        for (int i = 0; i < element_size; i++)
+            if (a_data_ptr[i] != b_data_ptr[i])
+                return -1;
 
-            for (int i = 0; i < element_size; i++)
-                if (a_data_ptr[i] != b_data_ptr[i])
-                    return -1;
+        break;
+    }
+    case TENGINE_DT_INT16:
+    {
+        int16_t* a_data_ptr = (int16_t*)get_tensor_buffer(a);
+        int16_t* b_data_ptr = (int16_t*)get_tensor_buffer(b);
 
-            break;
-        }
-        case TENGINE_DT_UINT8:
-        case TENGINE_DT_INT8:
-        {
-            int8_t* a_data_ptr = (int8_t*)get_tensor_buffer(a);
-            int8_t* b_data_ptr = (int8_t*)get_tensor_buffer(b);
+        for (int i = 0; i < element_size; i++)
+            if (a_data_ptr[i] != b_data_ptr[i])
+                return -1;
 
-            for (int i = 0; i < element_size; i++)
-                if (a_data_ptr[i] != b_data_ptr[i])
-                    return -1;
+        break;
+    }
+    case TENGINE_DT_UINT8:
+    case TENGINE_DT_INT8:
+    {
+        int8_t* a_data_ptr = (int8_t*)get_tensor_buffer(a);
+        int8_t* b_data_ptr = (int8_t*)get_tensor_buffer(b);
 
-            break;
-        }
-        default:
-        {
-            fprintf(stderr, "The type of tensor was not supported.\n");
-            return -1;
-        }
+        for (int i = 0; i < element_size; i++)
+            if (a_data_ptr[i] != b_data_ptr[i])
+                return -1;
+
+        break;
+    }
+    default:
+    {
+        fprintf(stderr, "The type of tensor was not supported.\n");
+        return -1;
+    }
     }
 
     return 0;
 }
 
-
 static inline unsigned long get_current_time(void)
 {
     struct timespec tm;
diff --git a/tests/op/test_op_conv.c b/tests/op/test_op_conv.c
index c1799d471..5a8cffaa2 100644
--- a/tests/op/test_op_conv.c
+++ b/tests/op/test_op_conv.c
@@ -24,7 +24,7 @@
 
 #include <stdio.h>
 #include <string.h>
-#include <malloc.h>           
+#include <malloc.h>
 
 #include "tengine/c_api.h"
 #include "tengine/c_api_ex.h"
@@ -41,41 +41,41 @@ void record_allocated_buf(void* buf)
 
 void free_allocated_buf(void)
 {
-    for(int i = 0; i < allocated_num; i++)
+    for (int i = 0; i < allocated_num; i++)
         free(record_ptr[i]);
 
-    if(record_ptr)
+    if (record_ptr)
         free(record_ptr);
 }
 
 void init_buffer(void* buf, int elem_num, int elem_size, int val)
 {
-    for(int i = 0; i < elem_num; i++)
+    for (int i = 0; i < elem_num; i++)
     {
         float val0;
         float* fp;
         int16_t* i16;
         char* c;
 
-        if(val >= 0)
+        if (val >= 0)
             val0 = val;
         else
-            val0 = i%10;
+            val0 = i % 10;
 
-        switch(elem_size)
+        switch (elem_size)
         {
-            case 4:
-                fp = ( float* )buf;
-                fp[i] = val0;
-                break;
-            case 2:
-                i16 = ( int16_t* )buf;
-                i16[i] = val0;
-                break;
-            case 1:
-                c = ( char* )buf;
-                c[i] = val0;
-                break;
+        case 4:
+            fp = (float*)buf;
+            fp[i] = val0;
+            break;
+        case 2:
+            i16 = (int16_t*)buf;
+            i16[i] = val0;
+            break;
+        case 1:
+            c = (char*)buf;
+            c[i] = val0;
+            break;
         }
     }
 }
@@ -129,7 +129,7 @@ int create_conv_node(graph_t graph, const char* node_name, const char* input_nam
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(input_tensor == NULL)
+    if (input_tensor == NULL)
     {
         fprintf(stderr, "errno= %d\n", get_tengine_errno());
         return -1;
@@ -178,7 +178,7 @@ int create_pooling_node(graph_t graph, const char* node_name, const char* input_
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(input_tensor == NULL)
+    if (input_tensor == NULL)
     {
         fprintf(stderr, "ERRNO: %d\n", get_tengine_errno());
         return -1;
@@ -202,7 +202,7 @@ graph_t create_test_graph(int c, int h, int w, int out_c)
 {
     graph_t graph = create_graph(NULL, NULL, NULL);
 
-    if(graph == NULL)
+    if (graph == NULL)
     {
         fprintf(stderr, "ERRNO: %d\n", get_tengine_errno());
         return NULL;
@@ -211,7 +211,7 @@ graph_t create_test_graph(int c, int h, int w, int out_c)
     const char* input_name = "data";
     const char* conv_name = "conv";
 
-    if(create_input_node(graph, input_name, c, h, w) < 0)
+    if (create_input_node(graph, input_name, c, h, w) < 0)
     {
         fprintf(stderr, "create input failed\n");
         return NULL;
@@ -219,7 +219,7 @@ graph_t create_test_graph(int c, int h, int w, int out_c)
 
     // int out_c = 4;
     //                                                k  s  p in_c out_c group
-    if(create_conv_node(graph, conv_name, input_name, 1, 1, 0, c, out_c, 1) < 0)
+    if (create_conv_node(graph, conv_name, input_name, 1, 1, 0, c, out_c, 1) < 0)
     {
         fprintf(stderr, "create conv node failed\n");
         return NULL;
@@ -243,13 +243,13 @@ graph_t create_test_graph(int c, int h, int w, int out_c)
 
 #endif
 
-    if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
+    if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set inputs failed: ERRNO: %d\n", get_tengine_errno());
         return NULL;
     }
 
-    if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
+    if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
     {
         fprintf(stderr, "set outputs failed: ERRNO: %d\n", get_tengine_errno());
         return NULL;
@@ -280,7 +280,7 @@ void fill_conv_node(node_t node)
 
     tensor_t bias = get_node_input_tensor(node, 2);
 
-    if(bias == NULL)
+    if (bias == NULL)
         return;
 
     get_tensor_shape(bias, dims, 1);
@@ -302,13 +302,13 @@ void fill_graph_param(graph_t graph)
 {
     int node_num = get_graph_node_num(graph);
 
-    for(int i = 0; i < node_num; i++)
+    for (int i = 0; i < node_num; i++)
     {
         node_t node = get_graph_node_by_idx(graph, i);
 
         const char* node_op = get_node_op(node);
 
-        if(!strcmp(node_op, "Convolution"))
+        if (!strcmp(node_op, "Convolution"))
         {
             fill_conv_node(node);
         }
@@ -329,8 +329,8 @@ int main(int argc, char* argv[])
     init_tengine();
 
     graph_t graph = create_test_graph(c, h, w, out_c);
- 
-    if(graph == NULL)
+
+    if (graph == NULL)
         return 1;
 
     fill_graph_param(graph);
@@ -344,7 +344,7 @@ int main(int argc, char* argv[])
     int elem_num = 1;
     int elem_size = 4;
 
-    for(int i = 0; i < dim_num; i++)
+    for (int i = 0; i < dim_num; i++)
         elem_num *= dims[i];
 
     void* input_buf = malloc(elem_num * elem_size);
@@ -369,7 +369,7 @@ int main(int argc, char* argv[])
 
     printf("output shape: [");
 
-    for(int i = 0; i < dim_num; i++)
+    for (int i = 0; i < dim_num; i++)
     {
         elem_num *= dims[i];
         printf(" %d", dims[i]);
@@ -379,11 +379,11 @@ int main(int argc, char* argv[])
 
     float* output = get_tensor_buffer(output_tensor);
 
-    for(int i = 0; i < elem_num; i++)
+    for (int i = 0; i < elem_num; i++)
     {
         int w = dims[3];
 
-        if((i % w) == 0)
+        if ((i % w) == 0)
             printf("\n%d:\t", i);
 
         printf(" %f", output[i]);
diff --git a/tests/op/test_op_prelu.c b/tests/op/test_op_prelu.c
index 16f6ee3b9..dd31e4b1e 100644
--- a/tests/op/test_op_prelu.c
+++ b/tests/op/test_op_prelu.c
@@ -24,17 +24,20 @@
 
 #include "test_op.h"
 
-
 int create_test_prelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "PReLU");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno());
         return -1;
@@ -47,7 +50,7 @@ int create_test_prelu_node(graph_t graph, const char* input_name, const char* no
 
     int dims[4];
     get_tensor_shape(input_tensor, dims, 4);
-    int slope_dims[1] = {dims[1]};  // channel num
+    int slope_dims[1] = {dims[1]}; // channel num
     set_tensor_shape(slope_tensor, slope_dims, 1);
 
     /* input tensors of test node */
@@ -78,7 +81,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_prelu_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set input data
@@ -102,9 +105,9 @@ int main(int argc, char* argv[])
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_data =  (float *)output_tensor->data + i * cstep;
+        float* output_data = (float*)output_tensor->data + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (output_data[j] != result_value[i])
diff --git a/tests/op/test_op_relu.c b/tests/op/test_op_relu.c
index fd8583023..730ab3260 100644
--- a/tests/op/test_op_relu.c
+++ b/tests/op/test_op_relu.c
@@ -24,10 +24,13 @@
 
 #include "test_op.h"
 
-
 int create_test_relu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "ReLU");
@@ -66,7 +69,6 @@ int create_test_relu_node(graph_t graph, const char* input_name, const char* nod
     return 0;
 }
 
-
 int main(int argc, char* argv[])
 {
     int n = 1, c = 3, h = 12, w = 12;
@@ -81,7 +83,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set input data
@@ -98,7 +100,7 @@ int main(int argc, char* argv[])
 
     // dump input node
     int input_node_count = get_graph_input_node_number(graph);
-    for(int i = 0; i < input_node_count; i++)
+    for (int i = 0; i < input_node_count; i++)
     {
         node_t input = get_graph_input_node(graph, i);
         dump_node_output(input, 0);
@@ -106,7 +108,7 @@ int main(int argc, char* argv[])
 
     // dump output node
     int output_node_count = get_graph_output_node_number(graph);
-    for(int i = 0; i < output_node_count; i++)
+    for (int i = 0; i < output_node_count; i++)
     {
         node_t output = get_graph_output_node(graph, i);
         dump_node_output(output, 0);
diff --git a/tests/op/test_op_relu6.c b/tests/op/test_op_relu6.c
index 1f772e97d..9315c6477 100644
--- a/tests/op/test_op_relu6.c
+++ b/tests/op/test_op_relu6.c
@@ -24,10 +24,13 @@
 
 #include "test_op.h"
 
-
 int create_test_relu6_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "ReLU6");
@@ -66,7 +69,6 @@ int create_test_relu6_node(graph_t graph, const char* input_name, const char* no
     return 0;
 }
 
-
 int main(int argc, char* argv[])
 {
     int n = 1, c = 3, h = 12, w = 12;
@@ -81,7 +83,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu6_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set input data
@@ -98,7 +100,7 @@ int main(int argc, char* argv[])
 
     // dump input node
     int input_node_count = get_graph_input_node_number(graph);
-    for(int i = 0; i < input_node_count; i++)
+    for (int i = 0; i < input_node_count; i++)
     {
         node_t input = get_graph_input_node(graph, i);
         dump_node_output(input, 0);
@@ -106,7 +108,7 @@ int main(int argc, char* argv[])
 
     // dump output node
     int output_node_count = get_graph_output_node_number(graph);
-    for(int i = 0; i < output_node_count; i++)
+    for (int i = 0; i < output_node_count; i++)
     {
         node_t output = get_graph_output_node(graph, i);
         dump_node_output(output, 0);
diff --git a/tests/op/test_tensorrt_op_clip.cpp b/tests/op/test_tensorrt_op_clip.cpp
index d166d7dc4..fef74a3ee 100644
--- a/tests/op/test_tensorrt_op_clip.cpp
+++ b/tests/op/test_tensorrt_op_clip.cpp
@@ -1,119 +1,120 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-
-int create_test_clip_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Clip");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    return 0;
-}
-
-float input_fp32[5] = {-3.0f, 3.0f, 8.0f, 1.0f, -2.0f};
-
-float reference_out[5] = {0.0f, 3.0f, 6.0f, 1.0f, 0.0f};
-
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 5, w = 1;
-    const char* test_node_name = "clip";
-    int data_type = TENGINE_DT_FP32;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    graph_t graph = create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_clip_node);
-    if(NULL == graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(graph);
-
-    // set quantize params
-    struct tensor* input_tensor =  (struct tensor*)get_graph_input_tensor(graph, 0, 0);
-    struct tensor* output_tensor =  (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-
-    // set input data
-    set_tensor_buffer(input_tensor, input_fp32, 5 * 4);
-
-    // graph run
-    ret = test_graph_run(graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(graph);
-        return -1;
-    }
-
-    // get output and dequant
-    float* output_data = ( float* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+int create_test_clip_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Clip");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    return 0;
+}
+
+float input_fp32[5] = {-3.0f, 3.0f, 8.0f, 1.0f, -2.0f};
+
+float reference_out[5] = {0.0f, 3.0f, 6.0f, 1.0f, 0.0f};
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 5, w = 1;
+    const char* test_node_name = "clip";
+    int data_type = TENGINE_DT_FP32;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    graph_t graph = create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_clip_node);
+    if (NULL == graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_input_tensor(graph, 0, 0);
+    struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
+
+    // set input data
+    set_tensor_buffer(input_tensor, input_fp32, 5 * 4);
+
+    // graph run
+    ret = test_graph_run(graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(graph);
+        return -1;
+    }
+
+    // get output and dequant
+    float* output_data = (float*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(graph);
+
+    return ret;
+}
diff --git a/tests/op/test_tensorrt_op_concat.cpp b/tests/op/test_tensorrt_op_concat.cpp
index 42668a05f..185d620c6 100644
--- a/tests/op/test_tensorrt_op_concat.cpp
+++ b/tests/op/test_tensorrt_op_concat.cpp
@@ -1,150 +1,188 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/concat_param.h"
-
-
-int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Concat");
-
-    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
-
-    if(NULL == input0_tensor)
-    {
-        fprintf(stderr, "create test node input0 failed.\n");
-        return -1;
-    }
-
-    node_t input1_node = create_graph_node(graph, "input1", "InputOp");
-    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32);
-    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT);
-    int input1_dims[4] = {1, 1, 3, 3};  // channel num
-    set_tensor_shape(input1_tensor, input1_dims, 4);
-
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input0_tensor);
-    set_node_input_tensor(test_node, 1, input1_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct concat_param* param = ( struct concat_param* )(struct node* )test_node->op.param_mem;
-
-    param->axis = 1;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,};
-
-float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-
-float reference_out[18] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,
-                           9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 3, w = 3;
-    const char* test_node_name = "concat";
-    int data_type = TENGINE_DT_FP32;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat");
-
-    // set input data
-    set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4);
-
-    // set input data
-    set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    float* output_data = ( float* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/concat_param.h"
+
+int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Concat");
+
+    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
+
+    if (NULL == input0_tensor)
+    {
+        fprintf(stderr, "create test node input0 failed.\n");
+        return -1;
+    }
+
+    node_t input1_node = create_graph_node(graph, "input1", "InputOp");
+    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32);
+    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT);
+    int input1_dims[4] = {1, 1, 3, 3}; // channel num
+    set_tensor_shape(input1_tensor, input1_dims, 4);
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input0_tensor);
+    set_node_input_tensor(test_node, 1, input1_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct concat_param* param = (struct concat_param*)(struct node*)test_node->op.param_mem;
+
+    param->axis = 1;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input0_fp32[9] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+};
+
+float input1_fp32[9] = {
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+
+float reference_out[18] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 3, w = 3;
+    const char* test_node_name = "concat";
+    int data_type = TENGINE_DT_FP32;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat");
+
+    // set input data
+    set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4);
+
+    // set input data
+    set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    float* output_data = (float*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_tensorrt_op_deconv.cpp b/tests/op/test_tensorrt_op_deconv.cpp
index fa0b7576c..7db3cb1ed 100644
--- a/tests/op/test_tensorrt_op_deconv.cpp
+++ b/tests/op/test_tensorrt_op_deconv.cpp
@@ -1,223 +1,246 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/deconv_param.h"
-
-
-int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Deconvolution");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */
-    /* weight */
-    node_t weight_node = create_graph_node(graph, "weight", "Const");
-    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32);
-    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
-    int weight_dims[4] = {1, 1, 3, 3};  // channel num
-    set_tensor_shape(weight_tensor, weight_dims, 4);
-
-    /* bias */
-    // node_t bias_node = create_graph_node(graph, "bias", "Const");
-    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
-    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
-    // int bias_dims[1] = {1};  // channel num
-    // set_tensor_shape(bias_tensor, bias_dims, 1);
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-    set_node_input_tensor(test_node, 1, weight_tensor);
-    // set_node_input_tensor(test_node, 2, bias_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct deconv_param* deconv_param = ( struct deconv_param* )(struct node* )test_node->op.param_mem;
-
-    deconv_param->num_output = 1;
-    deconv_param->kernel_h = 3;
-    deconv_param->kernel_w = 3;
-    deconv_param->stride_h = 2;
-    deconv_param->stride_w = 2;
-    deconv_param->pad_h0 = 0;
-    deconv_param->pad_w0 = 0;
-    deconv_param->pad_h1 = 0;
-    deconv_param->pad_w1 = 0;
-    deconv_param->dilation_h = 1;
-    deconv_param->dilation_w = 1;
-    deconv_param->group = 1;
-    deconv_param->activation = -1;
-    deconv_param->output_pad_h0 = 0;
-    deconv_param->output_pad_w0 = 0;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,};
-
-float weight_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-
-float reference_out[49] = {27.000000,
-                           0.000000,
-                           81.000000,
-                           0.000000,
-                           33.000000,
-                           0.000000,
-                           3.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           84.000000,
-                           0.000000,
-                           86.000000,
-                           0.000000,
-                           95.000000,
-                           0.000000,
-                           23.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           36.000000,
-                           0.000000,
-                           50.000000,
-                           0.000000,
-                           50.000000,
-                           0.000000,
-                           23.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           3.000000,
-                           0.000000,
-                           8.000000,
-                           0.000000,
-                           7.000000,
-                           0.000000,
-                           6.000000, };
-
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 3, w = 3;
-    const char* test_node_name = "deconv";
-    int data_type = TENGINE_DT_FP32;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv");
-
-    // set input data
-    set_tensor_buffer(input_tensor, input_fp32, 9 * 4);
-
-    // set weight data
-    set_tensor_buffer(weight_tensor, weight_fp32, 9 * 4);
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    float* output_data = ( float* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/deconv_param.h"
+
+int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Deconvolution");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */
+    /* weight */
+    node_t weight_node = create_graph_node(graph, "weight", "Const");
+    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32);
+    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
+    int weight_dims[4] = {1, 1, 3, 3}; // channel num
+    set_tensor_shape(weight_tensor, weight_dims, 4);
+
+    /* bias */
+    // node_t bias_node = create_graph_node(graph, "bias", "Const");
+    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
+    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
+    // int bias_dims[1] = {1};  // channel num
+    // set_tensor_shape(bias_tensor, bias_dims, 1);
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+    set_node_input_tensor(test_node, 1, weight_tensor);
+    // set_node_input_tensor(test_node, 2, bias_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct deconv_param* deconv_param = (struct deconv_param*)(struct node*)test_node->op.param_mem;
+
+    deconv_param->num_output = 1;
+    deconv_param->kernel_h = 3;
+    deconv_param->kernel_w = 3;
+    deconv_param->stride_h = 2;
+    deconv_param->stride_w = 2;
+    deconv_param->pad_h0 = 0;
+    deconv_param->pad_w0 = 0;
+    deconv_param->pad_h1 = 0;
+    deconv_param->pad_w1 = 0;
+    deconv_param->dilation_h = 1;
+    deconv_param->dilation_w = 1;
+    deconv_param->group = 1;
+    deconv_param->activation = -1;
+    deconv_param->output_pad_h0 = 0;
+    deconv_param->output_pad_w0 = 0;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[9] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+};
+
+float weight_fp32[9] = {
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+
+float reference_out[49] = {
+    27.000000,
+    0.000000,
+    81.000000,
+    0.000000,
+    33.000000,
+    0.000000,
+    3.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    84.000000,
+    0.000000,
+    86.000000,
+    0.000000,
+    95.000000,
+    0.000000,
+    23.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    36.000000,
+    0.000000,
+    50.000000,
+    0.000000,
+    50.000000,
+    0.000000,
+    23.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    3.000000,
+    0.000000,
+    8.000000,
+    0.000000,
+    7.000000,
+    0.000000,
+    6.000000,
+};
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 3, w = 3;
+    const char* test_node_name = "deconv";
+    int data_type = TENGINE_DT_FP32;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv");
+
+    // set input data
+    set_tensor_buffer(input_tensor, input_fp32, 9 * 4);
+
+    // set weight data
+    set_tensor_buffer(weight_tensor, weight_fp32, 9 * 4);
+
+    // set bias data
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    float* output_data = (float*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_tensorrt_op_dropout.cpp b/tests/op/test_tensorrt_op_dropout.cpp
index d9c53d416..51453b2e7 100644
--- a/tests/op/test_tensorrt_op_dropout.cpp
+++ b/tests/op/test_tensorrt_op_dropout.cpp
@@ -1,133 +1,144 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-
-
-int create_test_dropout_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Dropout");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[6] = {1.0f, 2.0f, 3.0f,
-                       4.0f, 5.0f, 6.0f, };
-
-float reference_out[6] = {1.0f, 2.0f, 3.0f,
-                          4.0f, 5.0f, 6.0f, };
-
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 2, h = 1, w = 3;
-    const char* test_node_name = "dropout";
-    int data_type = TENGINE_DT_FP32;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_dropout_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "dropout");
-
-
-    // set input data
-    set_tensor_buffer(input_tensor, input_fp32, 6 * 4);
-
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    float* output_data = ( float* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+
+int create_test_dropout_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout; // layout and the n/c/h/w arguments are accepted but never read by this builder
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* Adds one "Dropout" node named node_name to graph, fed by the existing tensor input_name; returns 0, or -1 when that tensor is not found. */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Dropout"); // NOTE(review): result is not NULL-checked
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* the looked-up tensor becomes input 0 of the test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* the output tensor reuses node_name as its name, takes data_type, and is attached as output 0 with TENSOR_TYPE_VAR */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    return 0;
+}
+
+/*
+ * FP32 test vectors: six input values and six expected outputs; the expected values are identical to the input.
+ * NOTE(review): the formulas below describe uint8 quantization; main() in this file requests TENGINE_DT_FP32 and never applies them.
+ * scale = (max - min) / 255, zero_point = -min / scale
+ * uint8 = clip(round(float32 / scale) + zero_point, 0, 255), float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[6] = {
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+};
+
+float reference_out[6] = {
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+};
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 2, h = 1, w = 3; // passed to the graph builder below; 1 * 2 * 1 * 3 = 6, the length of the test arrays
+    const char* test_node_name = "dropout";
+    int data_type = TENGINE_DT_FP32;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // Test driver: init, build a one-node graph, feed input_fp32, run, compare with reference_out. First step: init.
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n"); // NOTE(review): failure is only logged, execution continues
+
+    // build the test graph; create_test_dropout_node is handed over as the function that adds the node under test
+    struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_dropout_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // look up the input and output tensors by name (this FP32 test sets no quantize params)
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "dropout");
+
+    // attach input_fp32 as the input buffer; 6 * 4 is its size in bytes assuming 4-byte floats
+    set_tensor_buffer(input_tensor, input_fp32, 6 * 4);
+
+    // run the graph; on error report the code, release the graph and fail
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // view the output buffer as floats (FP32 was requested, so nothing is dequantized here)
+    float* output_data = (float*)output_tensor->data; // NOTE(review): output_tensor is not NULL-checked
+    int output_size = output_tensor->elem_num;
+
+    // compare every element with the reference using an absolute tolerance of 0.1; all mismatches are printed
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): assumes elem_num does not exceed the 6 entries of reference_out
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // release the graph before returning the verdict (0 = pass, -1 = fail)
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_tensorrt_op_eltwise.cpp b/tests/op/test_tensorrt_op_eltwise.cpp
index a59b99edd..3cf144c23 100644
--- a/tests/op/test_tensorrt_op_eltwise.cpp
+++ b/tests/op/test_tensorrt_op_eltwise.cpp
@@ -1,157 +1,186 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/eltwise_param.h"
-
-
-int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Eltwise");
-
-    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
-
-    if(NULL == input0_tensor)
-    {
-        fprintf(stderr, "create test node input0 failed.\n");
-        return -1;
-    }
-
-    node_t input1_node = create_graph_node(graph, "input1", "InputOp");
-    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32);
-    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT);
-    int input1_dims[4] = {1, 1, 3, 3};  // channel num
-    set_tensor_shape(input1_tensor, input1_dims, 4);
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input0_tensor);
-    set_node_input_tensor(test_node, 1, input1_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct eltwise_param* param = ( struct eltwise_param* )(struct node* )test_node->op.param_mem;
-
-    param->type = 2;
-    param->caffe_flavor = 1;
-    param->shift = NULL;
-    param->power = NULL;
-    param->scale = NULL;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,};
-
-float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-
-float reference_out[9] = {12.0f, 8.0f, 4.0f, 9.0f, 5.0f, 7.0f, 4.0f, 2.0f, 5.0f,};
-
-
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 3, w = 3;
-    const char* test_node_name = "eltwise";
-    int data_type = TENGINE_DT_FP32;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise");
-
-
-    // set input data
-    set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4);
-
-    // set input data
-    set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4);
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    float* output_data = ( float* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/eltwise_param.h"
+
+int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout; // layout and the n/c/h/w arguments are accepted but never read by this builder
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* Despite the "concat" in its name, this adds an "Eltwise" node named node_name with two inputs; returns 0, or -1 when input_name0 is not found. */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Eltwise"); // NOTE(review): result is not NULL-checked
+
+    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
+
+    if (NULL == input0_tensor)
+    {
+        fprintf(stderr, "create test node input0 failed.\n");
+        return -1;
+    }
+
+    node_t input1_node = create_graph_node(graph, "input1", "InputOp"); // second operand: an extra "InputOp" node named "input1"
+    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_FP32); // always FP32, independent of data_type
+    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_INPUT);
+    int input1_dims[4] = {1, 1, 3, 3}; // full 4-D shape, 9 elements in total
+    set_tensor_shape(input1_tensor, input1_dims, 4);
+
+    /* operand order on the test node: existing tensor at index 0, the new "input1" tensor at index 1 */
+    set_node_input_tensor(test_node, 0, input0_tensor);
+    set_node_input_tensor(test_node, 1, input1_tensor);
+
+    /* the output tensor reuses node_name as its name, takes data_type, and is attached as output 0 with TENSOR_TYPE_VAR */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* fill the node's parameter block in place */
+    struct eltwise_param* param = (struct eltwise_param*)(struct node*)test_node->op.param_mem; // NOTE(review): the (struct node*) cast applies to param_mem and looks redundant
+
+    param->type = 2; // presumably the element-wise sum mode (the test's reference_out is input0 + input1) - confirm against eltwise_param.h
+    param->caffe_flavor = 1;
+    param->shift = NULL;
+    param->power = NULL;
+    param->scale = NULL;
+
+    return 0;
+}
+
+/*
+ * FP32 test vectors: two 9-element operands and the expected result, which is their element-wise sum (3 + 9 = 12, 8 + 0 = 8, ...).
+ * NOTE(review): the formulas below describe uint8 quantization; main() in this file requests TENGINE_DT_FP32 and never applies them.
+ * scale = (max - min) / 255, zero_point = -min / scale
+ * uint8 = clip(round(float32 / scale) + zero_point, 0, 255), float32 = (uint8 - zero_point) * scale
+ */
+float input0_fp32[9] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+};
+
+float input1_fp32[9] = {
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+
+float reference_out[9] = {
+    12.0f,
+    8.0f,
+    4.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    4.0f,
+    2.0f,
+    5.0f,
+};
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 3, w = 3; // passed to the graph builder below; 1 * 1 * 3 * 3 = 9, the length of the test arrays
+    const char* test_node_name = "eltwise";
+    int data_type = TENGINE_DT_FP32;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // Test driver: init, build a one-node graph, feed both operands, run, compare with reference_out. First step: init.
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n"); // NOTE(review): failure is only logged, execution continues
+
+    // build the test graph; create_test_concat_node (which adds an Eltwise node) is handed over as the node builder
+    struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // look up both operand tensors and the output tensor by name (this FP32 test sets no quantize params)
+    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise");
+
+    // attach input0_fp32 as the first operand; 9 * 4 is its size in bytes assuming 4-byte floats
+    set_tensor_buffer(input0_tensor, input0_fp32, 9 * 4);
+
+    // attach input1_fp32 as the second operand, same byte size
+    set_tensor_buffer(input1_tensor, input1_fp32, 9 * 4);
+
+    // no bias is set in this test; the call below is disabled and nothing runs here
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // run the graph; on error report the code, release the graph and fail
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // view the output buffer as floats (FP32 was requested, so nothing is dequantized here)
+    float* output_data = (float*)output_tensor->data; // NOTE(review): output_tensor is not NULL-checked
+    int output_size = output_tensor->elem_num;
+
+    // compare every element with the reference using an absolute tolerance of 0.1; all mismatches are printed
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): assumes elem_num does not exceed the 9 entries of reference_out
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // release the graph before returning the verdict (0 = pass, -1 = fail)
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_tensorrt_op_fc.cpp b/tests/op/test_tensorrt_op_fc.cpp
index 6da5459e2..be4597531 100644
--- a/tests/op/test_tensorrt_op_fc.cpp
+++ b/tests/op/test_tensorrt_op_fc.cpp
@@ -1,152 +1,163 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/fc_param.h"
-
-
-int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "FullyConnected");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */
-    /* weight */
-    node_t weight_node = create_graph_node(graph, "weight", "Const");
-    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32);
-    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
-    int weight_dims[2] = {1, 3};  // channel num
-    set_tensor_shape(weight_tensor, weight_dims, 2);
-
-    /* bias */
-    // node_t bias_node = create_graph_node(graph, "bias", "Const");
-    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
-    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
-    // int bias_dims[1] = {1};  // channel num
-    // set_tensor_shape(bias_tensor, bias_dims, 1);
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-    set_node_input_tensor(test_node, 1, weight_tensor);
-    // set_node_input_tensor(test_node, 2, bias_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct fc_param* param = ( struct fc_param* )(struct node* )test_node->op.param_mem;
-
-    param->num_output = 1;
-
-    return 0;
-}
-
-
-float input_fp32[3] = {3.0f, 8.0f, 1.0f,};
-
-float weight_fp32[3] = {9.0f, 0.0f, 3.0f,};
-
-float reference_out[1] = {30,};
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 3, h = 1, w = 1;
-    const char* test_node_name = "conv";
-    int data_type = TENGINE_DT_FP32;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv");
-
-    // set input data
-    set_tensor_buffer(input_tensor, input_fp32, 3 * 4);
-
-    // set weight data
-    set_tensor_buffer(weight_tensor, weight_fp32, 3 * 4);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    float* output_data = ( float* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/fc_param.h"
+
+int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout; // layout and the n/c/h/w arguments are accepted but never read by this builder
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* Adds one "FullyConnected" node named node_name with an input and a weight tensor (no bias); returns 0, or -1 when input_name is not found. */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "FullyConnected"); // NOTE(review): result is not NULL-checked
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* create the helper nodes that produce the extra input tensors the test node needs (weight here; bias is disabled below) */
+    /* weight: a "Const" node named "weight" whose FP32 output tensor is marked TENSOR_TYPE_CONST */
+    node_t weight_node = create_graph_node(graph, "weight", "Const");
+    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_FP32);
+    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
+    int weight_dims[2] = {1, 3}; // 2-D weight shape, 3 elements in total
+    set_tensor_shape(weight_tensor, weight_dims, 2);
+
+    /* bias: disabled, the code below is kept commented out */
+    // node_t bias_node = create_graph_node(graph, "bias", "Const");
+    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
+    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
+    // int bias_dims[1] = {1};  // channel num
+    // set_tensor_shape(bias_tensor, bias_dims, 1);
+
+    /* operand order on the test node: input at index 0, weight at index 1 (index 2 would be the disabled bias) */
+    set_node_input_tensor(test_node, 0, input_tensor);
+    set_node_input_tensor(test_node, 1, weight_tensor);
+    // set_node_input_tensor(test_node, 2, bias_tensor);
+
+    /* the output tensor reuses node_name as its name, takes data_type, and is attached as output 0 with TENSOR_TYPE_VAR */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* fill the node's parameter block in place */
+    struct fc_param* param = (struct fc_param*)(struct node*)test_node->op.param_mem; // NOTE(review): the (struct node*) cast applies to param_mem and looks redundant
+
+    param->num_output = 1; // a single output value, matching the 1-element reference_out of this test
+
+    return 0;
+}
+
+float input_fp32[3] = {
+    3.0f,
+    8.0f,
+    1.0f,
+}; // FP32 input values fed to the "input_node" tensor by main()
+
+float weight_fp32[3] = {
+    9.0f,
+    0.0f,
+    3.0f,
+}; // FP32 weights fed to the "weight" tensor by main()
+
+float reference_out[1] = {
+    30,
+}; // expected output: the dot product 3 * 9 + 8 * 0 + 1 * 3 = 30
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 3, h = 1, w = 1; // passed to the graph builder below; 1 * 3 * 1 * 1 = 3, the length of input_fp32
+    const char* test_node_name = "conv"; // NOTE(review): the node built is "FullyConnected"; "conv" is only the name used for lookup below
+    int data_type = TENGINE_DT_FP32;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // Test driver: init, build a one-node graph, feed input and weight, run, compare with reference_out. First step: init.
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n"); // NOTE(review): failure is only logged, execution continues
+
+    // build the test graph; create_test_fc_node is handed over as the function that adds the node under test
+    struct graph* ir_graph = (struct graph*)create_tensorrt_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // look up the input, weight and output tensors by name (this FP32 test sets no quantize params)
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv");
+
+    // attach input_fp32 as the input buffer; 3 * 4 is its size in bytes assuming 4-byte floats
+    set_tensor_buffer(input_tensor, input_fp32, 3 * 4);
+
+    // attach weight_fp32 as the weight buffer, same byte size
+    set_tensor_buffer(weight_tensor, weight_fp32, 3 * 4);
+
+    // run the graph; on error report the code, release the graph and fail
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // view the output buffer as floats (FP32 was requested, so nothing is dequantized here)
+    float* output_data = (float*)output_tensor->data; // NOTE(review): output_tensor is not NULL-checked
+    int output_size = output_tensor->elem_num;
+
+    // compare every element with the reference using an absolute tolerance of 0.1; all mismatches are printed
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): assumes elem_num does not exceed the single entry of reference_out
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // release the graph before returning the verdict (0 = pass, -1 = fail)
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_clip.cpp b/tests/op/test_timvx_op_clip.cpp
index c3ea778d2..4e82c2f71 100644
--- a/tests/op/test_timvx_op_clip.cpp
+++ b/tests/op/test_timvx_op_clip.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_clip_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Clip");
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Clip");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +80,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_clip_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,7 +103,7 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
@@ -109,13 +111,13 @@ int main(int argc, char* argv[])
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     std::vector<float> output_data(output_size);
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data.data() + i * cstep;
+        float* output_value = (float*)output_data.data() + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01f)
diff --git a/tests/op/test_timvx_op_concat.cpp b/tests/op/test_timvx_op_concat.cpp
index 05e9e7a67..2a9eeebde 100644
--- a/tests/op/test_timvx_op_concat.cpp
+++ b/tests/op/test_timvx_op_concat.cpp
@@ -1,187 +1,225 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/concat_param.h"
-
-
-int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Concat");
-
-    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
-
-    if(NULL == input0_tensor)
-    {
-        fprintf(stderr, "create test node input0 failed.\n");
-        return -1;
-    }
-
-    node_t input1_node = create_graph_node(graph, "input1", "Const");
-    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8);
-    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST);
-    int input1_dims[4] = {1, 1, 3, 3};  // channel num
-    set_tensor_shape(input1_tensor, input1_dims, 4);
-
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input0_tensor);
-    set_node_input_tensor(test_node, 1, input1_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct concat_param* param = ( struct concat_param* )(struct node* )test_node->op.param_mem;
-
-    param->axis = 1;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,};
-float input0_scale = 1;
-int input0_zero_point = 0;
-
-float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-float input1_scale = 1;
-int input1_zero_point = 0;
-
-float reference_out[18] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,
-                           9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 3, w = 3;
-    const char* test_node_name = "concat";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1);
-    set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input0_u8[9] = {0};
-    get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point);
-    set_tensor_buffer(input0_tensor, input0_u8, 9);
-
-    // set input data
-    uint8_t input1_u8[9] = {0};
-    get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point);
-    set_tensor_buffer(input1_tensor, input1_u8, 9);
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/concat_param.h"
+
+int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Concat");
+
+    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
+
+    if (NULL == input0_tensor)
+    {
+        fprintf(stderr, "create test node input0 failed.\n");
+        return -1;
+    }
+
+    node_t input1_node = create_graph_node(graph, "input1", "Const");
+    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8);
+    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST);
+    int input1_dims[4] = {1, 1, 3, 3}; // channel num
+    set_tensor_shape(input1_tensor, input1_dims, 4);
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input0_tensor);
+    set_node_input_tensor(test_node, 1, input1_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct concat_param* param = (struct concat_param*)(struct node*)test_node->op.param_mem;
+
+    param->axis = 1;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input0_fp32[9] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+};
+float input0_scale = 1;
+int input0_zero_point = 0;
+
+float input1_fp32[9] = {
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+float input1_scale = 1;
+int input1_zero_point = 0;
+
+float reference_out[18] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point);
+        if (udata > 255)
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 3, w = 3;
+    const char* test_node_name = "concat";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "concat");
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1);
+    set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input0 data
+    uint8_t input0_u8[9] = {0};
+    get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point);
+    set_tensor_buffer(input0_tensor, input0_u8, 9);
+
+    // set input1 data
+    uint8_t input1_u8[9] = {0};
+    get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point);
+    set_tensor_buffer(input1_tensor, input1_u8, 9);
+
+    // bias data: not used by this test (the call below is disabled)
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_convolution.cpp b/tests/op/test_timvx_op_convolution.cpp
index d5cd6f86b..da10249ab 100644
--- a/tests/op/test_timvx_op_convolution.cpp
+++ b/tests/op/test_timvx_op_convolution.cpp
@@ -22,7 +22,6 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
 #include "graph/graph.h"
@@ -30,17 +29,20 @@
 #include "graph/tensor.h"
 #include "operator/prototype/convolution_param.h"
 
-
 int create_test_convolution_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Convolution");
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Convolution");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -51,7 +53,7 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch
     node_t weight_node = create_graph_node(graph, "weight", "Const");
     tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8);
     set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
-    int weight_dims[4] = {1, 1, 3, 3};  // channel num
+    int weight_dims[4] = {1, 1, 3, 3}; // channel num
     set_tensor_shape(weight_tensor, weight_dims, 4);
 
     /* bias */
@@ -59,7 +61,7 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch
     // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
     // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
     // int bias_dims[1] = {1};  // channel num
-    // set_tensor_shape(bias_tensor, bias_dims, 1); 
+    // set_tensor_shape(bias_tensor, bias_dims, 1);
 
     /* input tensors of test node */
     set_node_input_tensor(test_node, 0, input_tensor);
@@ -71,7 +73,7 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch
     set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
 
     /* set params */
-    struct conv_param* conv_param = ( struct conv_param* )(struct node* )test_node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)(struct node*)test_node->op.param_mem;
 
     conv_param->kernel_h = 3;
     conv_param->kernel_w = 3;
@@ -98,8 +100,8 @@ int create_test_convolution_node(graph_t graph, const char* input_name, const ch
  * float32 = (uint8 - zero_point) * scale
  */
 float input_fp32[9] = {-3, -2, 1,
-                        1,  0, 2,
-                        1,  1, 1};
+                       1, 0, 2,
+                       1, 1, 1};
 float input_scale = 0.0196078f;
 int input_zero_point = 153;
 
@@ -110,12 +112,11 @@ float weight_scale = 0.0039216f;
 int weight_zero_point = 0;
 
 float reference_out[9] = {-4, -1, 1,
-                          -2,  2, 3,
-                           3,  6, 4};
+                          -2, 2, 3,
+                          3, 6, 4};
 float output_scale = 0.03921568f;
 int output_zero_point = 102;
 
-
 void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
 {
     for (int i = 0; i < size; i++)
@@ -143,8 +144,8 @@ int main(int argc, char* argv[])
         fprintf(stderr, "Tengine init failed.\n");
 
     // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_convolution_node);
-    if(NULL == ir_graph)
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_convolution_node);
+    if (NULL == ir_graph)
         return -1;
 
     set_log_level(LOG_INFO);
@@ -155,7 +156,7 @@ int main(int argc, char* argv[])
     struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
     struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv");
 
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
     set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
     set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1);
     set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
@@ -183,17 +184,17 @@ int main(int argc, char* argv[])
     }
 
     // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< output_size; i++)
+    for (int i = 0; i < output_size; i++)
     {
         if (fabsf(output_data[i] - reference_out[i]) > 0.1)
         {
diff --git a/tests/op/test_timvx_op_deconv.cpp b/tests/op/test_timvx_op_deconv.cpp
index 15f6cdfd5..2013f37ea 100644
--- a/tests/op/test_timvx_op_deconv.cpp
+++ b/tests/op/test_timvx_op_deconv.cpp
@@ -1,257 +1,280 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/deconv_param.h"
-
-
-int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Deconvolution");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */
-    /* weight */
-    node_t weight_node = create_graph_node(graph, "weight", "Const");
-    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8);
-    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
-    int weight_dims[4] = {1, 1, 3, 3};  // channel num
-    set_tensor_shape(weight_tensor, weight_dims, 4);
-
-    /* bias */
-    // node_t bias_node = create_graph_node(graph, "bias", "Const");
-    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
-    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
-    // int bias_dims[1] = {1};  // channel num
-    // set_tensor_shape(bias_tensor, bias_dims, 1);
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-    set_node_input_tensor(test_node, 1, weight_tensor);
-    // set_node_input_tensor(test_node, 2, bias_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct deconv_param* deconv_param = ( struct deconv_param* )(struct node* )test_node->op.param_mem;
-
-    deconv_param->num_output = 1;
-    deconv_param->kernel_h = 3;
-    deconv_param->kernel_w = 3;
-    deconv_param->stride_h = 2;
-    deconv_param->stride_w = 2;
-    deconv_param->pad_h0 = 0;
-    deconv_param->pad_w0 = 0;
-    deconv_param->pad_h1 = 0;
-    deconv_param->pad_w1 = 0;
-    deconv_param->dilation_h = 1;
-    deconv_param->dilation_w = 1;
-    deconv_param->group = 1;
-    deconv_param->activation = -1;
-    deconv_param->output_pad_h0 = 0;
-    deconv_param->output_pad_w0 = 0;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,};
-float input_scale = 1;
-int input_zero_point = 0;
-
-float weight_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-float weight_scale = 1;
-int weight_zero_point = 0;
-
-float reference_out[49] = {27.000000,
-                           0.000000,
-                           81.000000,
-                           0.000000,
-                           33.000000,
-                           0.000000,
-                           3.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           84.000000,
-                           0.000000,
-                           86.000000,
-                           0.000000,
-                           95.000000,
-                           0.000000,
-                           23.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           36.000000,
-                           0.000000,
-                           50.000000,
-                           0.000000,
-                           50.000000,
-                           0.000000,
-                           23.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           0.000000,
-                           3.000000,
-                           0.000000,
-                           8.000000,
-                           0.000000,
-                           7.000000,
-                           0.000000,
-                           6.000000, };
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 3, w = 3;
-    const char* test_node_name = "deconv";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[9] = {0};
-    get_uint8_data(input_fp32, input_u8, 9, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 9);
-
-    // set weight data
-    uint8_t weight_u8[9] = {0};
-    get_uint8_data(weight_fp32, weight_u8, 9, weight_scale, weight_zero_point);
-    set_tensor_buffer(weight_tensor, weight_u8, 9);
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/deconv_param.h"
+
+int create_test_deconv_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Deconvolution");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* create the sub nodes that produce the other input tensors the test node needs, such as the weight/bias/slope tensors. */
+    /* weight */
+    node_t weight_node = create_graph_node(graph, "weight", "Const");
+    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8);
+    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
+    int weight_dims[4] = {1, 1, 3, 3}; // channel num
+    set_tensor_shape(weight_tensor, weight_dims, 4);
+
+    /* bias */
+    // node_t bias_node = create_graph_node(graph, "bias", "Const");
+    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
+    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
+    // int bias_dims[1] = {1};  // channel num
+    // set_tensor_shape(bias_tensor, bias_dims, 1);
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+    set_node_input_tensor(test_node, 1, weight_tensor);
+    // set_node_input_tensor(test_node, 2, bias_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct deconv_param* deconv_param = (struct deconv_param*)(struct node*)test_node->op.param_mem;
+
+    deconv_param->num_output = 1;
+    deconv_param->kernel_h = 3;
+    deconv_param->kernel_w = 3;
+    deconv_param->stride_h = 2;
+    deconv_param->stride_w = 2;
+    deconv_param->pad_h0 = 0;
+    deconv_param->pad_w0 = 0;
+    deconv_param->pad_h1 = 0;
+    deconv_param->pad_w1 = 0;
+    deconv_param->dilation_h = 1;
+    deconv_param->dilation_w = 1;
+    deconv_param->group = 1;
+    deconv_param->activation = -1;
+    deconv_param->output_pad_h0 = 0;
+    deconv_param->output_pad_w0 = 0;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[9] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+};
+float input_scale = 1;
+int input_zero_point = 0;
+
+float weight_fp32[9] = {
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+float weight_scale = 1;
+int weight_zero_point = 0;
+
+float reference_out[49] = {
+    27.000000,
+    0.000000,
+    81.000000,
+    0.000000,
+    33.000000,
+    0.000000,
+    3.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    84.000000,
+    0.000000,
+    86.000000,
+    0.000000,
+    95.000000,
+    0.000000,
+    23.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    36.000000,
+    0.000000,
+    50.000000,
+    0.000000,
+    50.000000,
+    0.000000,
+    23.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    0.000000,
+    3.000000,
+    0.000000,
+    8.000000,
+    0.000000,
+    7.000000,
+    0.000000,
+    6.000000,
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point);
+        if (udata > 255)
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 3, w = 3;
+    const char* test_node_name = "deconv";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_deconv_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "deconv");
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input_u8[9] = {0};
+    get_uint8_data(input_fp32, input_u8, 9, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 9);
+
+    // set weight data
+    uint8_t weight_u8[9] = {0};
+    get_uint8_data(weight_fp32, weight_u8, 9, weight_scale, weight_zero_point);
+    set_tensor_buffer(weight_tensor, weight_u8, 9);
+
+    // set bias data
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_dropout.cpp b/tests/op/test_timvx_op_dropout.cpp
index ac991bf6b..85f7e29ad 100644
--- a/tests/op/test_timvx_op_dropout.cpp
+++ b/tests/op/test_timvx_op_dropout.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_dropout_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "Dropout");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +80,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_dropout_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,21 +103,21 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data + i * cstep;
+        float* output_value = (float*)output_data + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01)
diff --git a/tests/op/test_timvx_op_eltwise_mul.cpp b/tests/op/test_timvx_op_eltwise_mul.cpp
index 8f464d3b6..a2af5a610 100644
--- a/tests/op/test_timvx_op_eltwise_mul.cpp
+++ b/tests/op/test_timvx_op_eltwise_mul.cpp
@@ -1,189 +1,220 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/eltwise_param.h"
-
-
-int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Eltwise");
-
-    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
-
-    if(NULL == input0_tensor)
-    {
-        fprintf(stderr, "create test node input0 failed.\n");
-        return -1;
-    }
-
-    node_t input1_node = create_graph_node(graph, "input1", "Const");
-    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8);
-    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST);
-    int input1_dims[4] = {1, 1, 3, 3};  // channel num
-    set_tensor_shape(input1_tensor, input1_dims, 4);
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input0_tensor);
-    set_node_input_tensor(test_node, 1, input1_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct eltwise_param* param = ( struct eltwise_param* )(struct node* )test_node->op.param_mem;
-
-    param->type = 0;
-    param->caffe_flavor = 1;
-    param->shift = NULL;
-    param->power = NULL;
-    param->scale = NULL;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,};
-float input0_scale = 1;
-int input0_zero_point = 0;
-
-float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-float input1_scale = 1;
-int input1_zero_point = 0;
-
-float reference_out[9] = {27.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 3.0f, 0.0f, 6.0f,};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 3, w = 3;
-    const char* test_node_name = "eltwise";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1);
-    set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input0_u8[9] = {0};
-    get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point);
-    set_tensor_buffer(input0_tensor, input0_u8, 9);
-
-    // set input data
-    uint8_t input1_u8[9] = {0};
-    get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point);
-    set_tensor_buffer(input1_tensor, input1_u8, 9);
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/eltwise_param.h"
+
+int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Eltwise");
+
+    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
+
+    if (NULL == input0_tensor)
+    {
+        fprintf(stderr, "create test node input0 failed.\n");
+        return -1;
+    }
+
+    node_t input1_node = create_graph_node(graph, "input1", "Const");
+    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8);
+    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST);
+    int input1_dims[4] = {1, 1, 3, 3}; // channel num
+    set_tensor_shape(input1_tensor, input1_dims, 4);
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input0_tensor);
+    set_node_input_tensor(test_node, 1, input1_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct eltwise_param* param = (struct eltwise_param*)(struct node*)test_node->op.param_mem;
+
+    param->type = 0;
+    param->caffe_flavor = 1;
+    param->shift = NULL;
+    param->power = NULL;
+    param->scale = NULL;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input0_fp32[9] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+};
+float input0_scale = 1;
+int input0_zero_point = 0;
+
+float input1_fp32[9] = {
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+float input1_scale = 1;
+int input1_zero_point = 0;
+
+float reference_out[9] = {
+    27.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    6.0f,
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point);
+        if (udata > 255)
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 3, w = 3;
+    const char* test_node_name = "eltwise";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise");
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1);
+    set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input0_u8[9] = {0};
+    get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point);
+    set_tensor_buffer(input0_tensor, input0_u8, 9);
+
+    // set input data
+    uint8_t input1_u8[9] = {0};
+    get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point);
+    set_tensor_buffer(input1_tensor, input1_u8, 9);
+
+    // set bias data
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_eltwise_sum.cpp b/tests/op/test_timvx_op_eltwise_sum.cpp
index 511aebb27..fb38ca449 100644
--- a/tests/op/test_timvx_op_eltwise_sum.cpp
+++ b/tests/op/test_timvx_op_eltwise_sum.cpp
@@ -1,189 +1,220 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/eltwise_param.h"
-
-
-int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Eltwise");
-
-    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
-
-    if(NULL == input0_tensor)
-    {
-        fprintf(stderr, "create test node input0 failed.\n");
-        return -1;
-    }
-
-    node_t input1_node = create_graph_node(graph, "input1", "Const");
-    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8);
-    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST);
-    int input1_dims[4] = {1, 1, 3, 3};  // channel num
-    set_tensor_shape(input1_tensor, input1_dims, 4);
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input0_tensor);
-    set_node_input_tensor(test_node, 1, input1_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct eltwise_param* param = ( struct eltwise_param* )(struct node* )test_node->op.param_mem;
-
-    param->type = 2;
-    param->caffe_flavor = 1;
-    param->shift = NULL;
-    param->power = NULL;
-    param->scale = NULL;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input0_fp32[9] = {3.0f, 8.0f, 1.0f, 9.0f, 5.0f, 7.0f, 3.0f, 2.0f, 3.0f,};
-float input0_scale = 1;
-int input0_zero_point = 0;
-
-float input1_fp32[9] = {9.0f, 0.0f, 3.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 2.0f,};
-float input1_scale = 1;
-int input1_zero_point = 0;
-
-float reference_out[9] = {12.0f, 8.0f, 4.0f, 9.0f, 5.0f, 7.0f, 4.0f, 2.0f, 5.0f,};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 3, w = 3;
-    const char* test_node_name = "eltwise";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1);
-    set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input0_u8[9] = {0};
-    get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point);
-    set_tensor_buffer(input0_tensor, input0_u8, 9);
-
-    // set input data
-    uint8_t input1_u8[9] = {0};
-    get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point);
-    set_tensor_buffer(input1_tensor, input1_u8, 9);
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/eltwise_param.h"
+
+int create_test_concat_node(graph_t graph, const char* input_name0, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Eltwise");
+
+    tensor_t input0_tensor = get_graph_tensor(graph, input_name0);
+
+    if (NULL == input0_tensor)
+    {
+        fprintf(stderr, "create test node input0 failed.\n");
+        return -1;
+    }
+
+    node_t input1_node = create_graph_node(graph, "input1", "Const");
+    tensor_t input1_tensor = create_graph_tensor(graph, "input1", TENGINE_DT_UINT8);
+    set_node_output_tensor(input1_node, 0, input1_tensor, TENSOR_TYPE_CONST);
+    int input1_dims[4] = {1, 1, 3, 3}; // channel num
+    set_tensor_shape(input1_tensor, input1_dims, 4);
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input0_tensor);
+    set_node_input_tensor(test_node, 1, input1_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct eltwise_param* param = (struct eltwise_param*)(struct node*)test_node->op.param_mem;
+
+    param->type = 2;
+    param->caffe_flavor = 1;
+    param->shift = NULL;
+    param->power = NULL;
+    param->scale = NULL;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input0_fp32[9] = {
+    3.0f,
+    8.0f,
+    1.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    3.0f,
+    2.0f,
+    3.0f,
+};
+float input0_scale = 1;
+int input0_zero_point = 0;
+
+float input1_fp32[9] = {
+    9.0f,
+    0.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    0.0f,
+    2.0f,
+};
+float input1_scale = 1;
+int input1_zero_point = 0;
+
+float reference_out[9] = {
+    12.0f,
+    8.0f,
+    4.0f,
+    9.0f,
+    5.0f,
+    7.0f,
+    4.0f,
+    2.0f,
+    5.0f,
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point);
+        if (udata > 255)
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 3, w = 3;
+    const char* test_node_name = "eltwise";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_concat_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input0_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* input1_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input1");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "eltwise");
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input0_tensor, &input0_scale, &input0_zero_point, 1);
+    set_tensor_quant_param(input1_tensor, &input1_scale, &input1_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input0_u8[9] = {0};
+    get_uint8_data(input0_fp32, input0_u8, 9, input0_scale, input0_zero_point);
+    set_tensor_buffer(input0_tensor, input0_u8, 9);
+
+    // set input data
+    uint8_t input1_u8[9] = {0};
+    get_uint8_data(input1_fp32, input1_u8, 9, input1_scale, input1_zero_point);
+    set_tensor_buffer(input1_tensor, input1_u8, 9);
+
+    // set bias data
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_elu.cpp b/tests/op/test_timvx_op_elu.cpp
index f421fdc53..39f82fac2 100644
--- a/tests/op/test_timvx_op_elu.cpp
+++ b/tests/op/test_timvx_op_elu.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_elu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Elu");
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Elu");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +80,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_elu_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,7 +103,7 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
@@ -109,13 +111,13 @@ int main(int argc, char* argv[])
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     std::vector<float> output_data(output_size);
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data.data() + i * cstep;
+        float* output_value = (float*)output_data.data() + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.05f)
diff --git a/tests/op/test_timvx_op_fc.cpp b/tests/op/test_timvx_op_fc.cpp
index 0cb721c8c..080fd0af8 100644
--- a/tests/op/test_timvx_op_fc.cpp
+++ b/tests/op/test_timvx_op_fc.cpp
@@ -1,195 +1,206 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/fc_param.h"
-
-
-int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "FullyConnected");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */
-    /* weight */
-    node_t weight_node = create_graph_node(graph, "weight", "Const");
-    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8);
-    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
-    int weight_dims[2] = {1, 3};  // channel num
-    set_tensor_shape(weight_tensor, weight_dims, 2);
-
-    /* bias */
-    // node_t bias_node = create_graph_node(graph, "bias", "Const");
-    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
-    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
-    // int bias_dims[1] = {1};  // channel num
-    // set_tensor_shape(bias_tensor, bias_dims, 1);
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-    set_node_input_tensor(test_node, 1, weight_tensor);
-    // set_node_input_tensor(test_node, 2, bias_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct fc_param* param = ( struct fc_param* )(struct node* )test_node->op.param_mem;
-
-    param->num_output = 1;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[3] = {3.0f, 8.0f, 1.0f,};
-float input_scale = 1;
-int input_zero_point = 0;
-
-float weight_fp32[3] = {9.0f, 0.0f, 3.0f,};
-float weight_scale = 1;
-int weight_zero_point = 0;
-
-float reference_out[1] = {30,};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 3, h = 1, w = 1;
-    const char* test_node_name = "conv";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[3] = {0};
-    get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 3);
-
-    // set weight data
-    uint8_t weight_u8[3] = {0};
-    get_uint8_data(weight_fp32, weight_u8, 3, weight_scale, weight_zero_point);
-    set_tensor_buffer(weight_tensor, weight_u8, 3);
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/fc_param.h"
+
+int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{ // builds a "FullyConnected" node named node_name fed by input_name plus a const uint8 weight; returns 0, or -1 if the input tensor is missing
+    (void)layout; // layout and n/c/h/w are part of the callback signature but unused here
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "FullyConnected"); // NOTE(review): result is not checked for NULL
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* create the sub nodes that produce the extra input tensors the test node needs, such as weight/bias/slope tensors. */
+    /* weight */
+    node_t weight_node = create_graph_node(graph, "weight", "Const");
+    tensor_t weight_tensor = create_graph_tensor(graph, "weight", TENGINE_DT_UINT8);
+    set_node_output_tensor(weight_node, 0, weight_tensor, TENSOR_TYPE_CONST);
+    int weight_dims[2] = {1, 3}; // 2-D shape with 3 elements - presumably {num_output, input size}; TODO confirm
+    set_tensor_shape(weight_tensor, weight_dims, 2);
+
+    /* bias (disabled: this test attaches no bias tensor) */
+    // node_t bias_node = create_graph_node(graph, "bias", "Const");
+    // tensor_t bias_tensor = create_graph_tensor(graph, "bias", TENGINE_DT_INT32);
+    // set_node_output_tensor(bias_node, 0, bias_tensor, TENSOR_TYPE_CONST);
+    // int bias_dims[1] = {1};  // channel num
+    // set_tensor_shape(bias_tensor, bias_dims, 1);
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+    set_node_input_tensor(test_node, 1, weight_tensor);
+    // set_node_input_tensor(test_node, 2, bias_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); // output tensor shares the node's name
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct fc_param* param = (struct fc_param*)(struct node*)test_node->op.param_mem; // the inner (struct node*) cast applies to param_mem (-> binds tighter) and is redundant
+
+    param->num_output = 1;
+
+    return 0;
+}
+
+/* uint8 quantization relations used by the test data below:
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[3] = {
+    3.0f,
+    8.0f,
+    1.0f,
+};
+float input_scale = 1; // with scale 1 and zero_point 0 the quantized value equals the float value
+int input_zero_point = 0;
+
+float weight_fp32[3] = {
+    9.0f,
+    0.0f,
+    3.0f,
+};
+float weight_scale = 1;
+int weight_zero_point = 0;
+
+float reference_out[1] = {
+    30, // 3*9 + 8*0 + 1*3 = 30, the dot product of input_fp32 and weight_fp32
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{ // quantizes `size` floats from data_fp32 into date_u8 (sic): round(x / scale + zero_point), clamped to [0, 255]
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point); // (round)(...) is an ordinary call to round(); result is converted to int
+        if (udata > 255) // clamp to the uint8 range before the narrowing store below
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{ // runs one uint8 FullyConnected node on a 1x3x1x1 input and compares the dequantized output with reference_out; returns 0 on pass, -1 otherwise
+    int n = 1, c = 3, h = 1, w = 1;
+    const char* test_node_name = "conv"; // the FullyConnected node is named "conv"; the same string is used for the output lookup below
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret) // NOTE(review): an init failure is only logged; execution continues
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create the test graph; create_test_fc_node is passed as the node-builder callback
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); // "input_node" is presumably the name given by create_timvx_test_graph - TODO confirm
+    struct tensor* weight_tensor = (struct tensor*)get_graph_tensor(ir_graph, "weight");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "conv"); // NOTE(review): none of these lookups is checked for NULL
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(weight_tensor, &weight_scale, &weight_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data: quantize input_fp32 and hand the 3-byte buffer to the input tensor
+    uint8_t input_u8[3] = {0};
+    get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 3);
+
+    // set weight data: same procedure for the 3 weights
+    uint8_t weight_u8[3] = {0};
+    get_uint8_data(weight_fp32, weight_u8, 3, weight_scale, weight_zero_point);
+    set_tensor_buffer(weight_tensor, weight_u8, 3);
+
+    // set bias data (disabled: no bias tensor is attached in this test)
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); // presumably reads the params back into the globals - TODO confirm
+    float* output_data = (float*)malloc(output_size * sizeof(float)); // NOTE(review): result is not checked and is never freed before return
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // float32 = (uint8 - zero_point) * scale
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): reference_out has 1 element, so this assumes output_size <= 1
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1) // absolute tolerance of 0.1 on the dequantized value
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1; // keep scanning so every mismatch is reported
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_flatten.cpp b/tests/op/test_timvx_op_flatten.cpp
index 0f960b8e1..fc0ff8e64 100644
--- a/tests/op/test_timvx_op_flatten.cpp
+++ b/tests/op/test_timvx_op_flatten.cpp
@@ -1,169 +1,177 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/flatten_param.h"
-
-
-int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Flatten");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct flatten_param* param = ( struct flatten_param* )(struct node* )test_node->op.param_mem;
-
-    param->axis = 1;
-    param->end_axis = 3;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[3] = {3.0f, 8.0f, 1.0f,};
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[3] = {3.0f, 8.0f, 1.0f,};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 3, h = 1, w = 1;
-    const char* test_node_name = "flatten";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "flatten");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[3] = {0};
-    get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 3);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/flatten_param.h"
+
+int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{ // NOTE(review): despite the "fc" name, this builds a "Flatten" node named node_name fed by input_name; returns 0, or -1 if the input tensor is missing
+    (void)layout; // layout and n/c/h/w are part of the callback signature but unused here
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Flatten"); // NOTE(review): result is not checked for NULL
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); // output tensor shares the node's name
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct flatten_param* param = (struct flatten_param*)(struct node*)test_node->op.param_mem; // the inner (struct node*) cast applies to param_mem (-> binds tighter) and is redundant
+
+    param->axis = 1; // presumably flattens axes 1 through 3 (C, H, W of the NCHW input) - TODO confirm
+    param->end_axis = 3;
+
+    return 0;
+}
+
+/* uint8 quantization relations used by the test data below:
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[3] = {
+    3.0f,
+    8.0f,
+    1.0f,
+};
+float input_scale = 1; // with scale 1 and zero_point 0 the quantized value equals the float value
+int input_zero_point = 0;
+
+float reference_out[3] = { // same values as input_fp32: the expected output is the input unchanged
+    3.0f,
+    8.0f,
+    1.0f,
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{ // quantizes `size` floats from data_fp32 into date_u8 (sic): round(x / scale + zero_point), clamped to [0, 255]
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point); // (round)(...) is an ordinary call to round(); result is converted to int
+        if (udata > 255) // clamp to the uint8 range before the narrowing store below
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{ // runs one uint8 Flatten node on a 1x3x1x1 input and compares the dequantized output with reference_out; returns 0 on pass, -1 otherwise
+    int n = 1, c = 3, h = 1, w = 1;
+    const char* test_node_name = "flatten"; // the same string is used for the output tensor lookup below
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret) // NOTE(review): an init failure is only logged; execution continues
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create the test graph; create_test_fc_node (which builds the Flatten node) is the node-builder callback
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node"); // "input_node" is presumably the name given by create_timvx_test_graph - TODO confirm
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "flatten"); // NOTE(review): neither lookup is checked for NULL
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data: quantize input_fp32 and hand the 3-byte buffer to the input tensor
+    uint8_t input_u8[3] = {0};
+    get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 3);
+
+    // set bias data (disabled: this test has no bias tensor)
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); // presumably reads the params back into the globals - TODO confirm
+    float* output_data = (float*)malloc(output_size * sizeof(float)); // NOTE(review): result is not checked and is never freed before return
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale; // float32 = (uint8 - zero_point) * scale
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): reference_out has 3 elements, so this assumes output_size <= 3
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1) // absolute tolerance of 0.1 on the dequantized value
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1; // keep scanning so every mismatch is reported
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_gather.cpp b/tests/op/test_timvx_op_gather.cpp
index cce32b477..c4c54690d 100644
--- a/tests/op/test_timvx_op_gather.cpp
+++ b/tests/op/test_timvx_op_gather.cpp
@@ -1,174 +1,182 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/gather_param.h"
-
-
-int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Gather");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct gather_param* param = ( struct gather_param* )(struct node* )test_node->op.param_mem;
-
-    param->axis = 1;
-    param->is_onnx = 1;
-    param->indices_num = 2;
-
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[6] = {3.0f, 7.0f,
-                       2.0f, 1.0f,
-                       4.0f, 6.0f,};
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[4] = {3.0f, 7.0f,
-                          2.0f, 1.0f,};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 3, h = 2, w = 1;
-    const char* test_node_name = "gather";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "gather");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[6] = {0};
-    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 6);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/gather_param.h"
+
+int create_test_fc_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{ /* graph-builder callback handed to create_timvx_test_graph() in main: appends one "Gather" node */
+    (void)layout; /* layout and the n/c/h/w dims are accepted but not used here */
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the node under test; despite the "fc" in the function name its op type is "Gather" */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Gather");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1; /* the lookup of input_name returned NULL */
+    }
+
+    /* the looked-up tensor becomes input 0 of the test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* the output tensor is created under the node's own name and attached as output 0 */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params; NOTE(review): test_node is dereferenced here without a NULL check */
+    struct gather_param* param = (struct gather_param*)(struct node*)test_node->op.param_mem;
+
+    param->axis = 1;
+    param->is_onnx = 1;
+    param->indices_num = 2;
+
+    return 0; /* 0 on success, -1 when the input tensor lookup fails */
+}
+
+/* uint8 quantization formulas the test data below relies on:
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[6] = { /* float input; main quantizes all 6 values with get_uint8_data() */
+    3.0f,
+    7.0f,
+    2.0f,
+    1.0f,
+    4.0f,
+    6.0f,
+};
+float input_scale = 1; /* with scale 1 and zero point 0 the whole-number inputs quantize exactly */
+int input_zero_point = 0;
+
+float reference_out[4] = { /* expected dequantized output; same values as the first four inputs */
+    3.0f,
+    7.0f,
+    2.0f,
+    1.0f,
+};
+float output_scale = 1; /* initial output quant params; main re-reads them from the tensor after the run */
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{ /* quantizes `size` floats from data_fp32 into date_u8, one uint8 per element */
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point); // rounded once, after the zero point is added
+        if (udata > 255) // saturate to the uint8 range [0, 255] before the narrowing store
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    // uint8 Gather test: quantize input_fp32, run the TIM-VX test graph, compare with reference_out
+    int n = 1, c = 3, h = 2, w = 1;
+    const char* test_node_name = "gather";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init; give up on failure instead of building a graph on an uninitialized runtime
+    int ret = test_graph_init();
+    if (0 != ret)
+    {
+        fprintf(stderr, "Tengine init failed.\n");
+        return -1;
+    }
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_fc_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params; both tensors are dereferenced later, so reject a failed lookup
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "gather");
+    if (NULL == input_tensor || NULL == output_tensor)
+    {
+        fprintf(stderr, "Get input/output tensor failed.\n");
+        test_graph_release(ir_graph);
+        return -1;
+    }
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input_u8[6] = {0};
+    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 6);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and re-read its quantization parameters from the tensor
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // check the result; each element is dequantized on the fly, so no temporary buffer is needed
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        float value = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+        if (fabsf(value - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, value, reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    test_graph_release(ir_graph);
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_hardswish.cpp b/tests/op/test_timvx_op_hardswish.cpp
index ae0f95fe1..e3df15ed6 100644
--- a/tests/op/test_timvx_op_hardswish.cpp
+++ b/tests/op/test_timvx_op_hardswish.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_hardswish_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Hardswish");
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Hardswish");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +80,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_hardswish_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,7 +103,7 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
@@ -109,13 +111,13 @@ int main(int argc, char* argv[])
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     std::vector<float> output_data(output_size);
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data.data() + i * cstep;
+        float* output_value = (float*)output_data.data() + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01f)
diff --git a/tests/op/test_timvx_op_interp.cpp b/tests/op/test_timvx_op_interp.cpp
index 625fe8b57..3a226fb10 100644
--- a/tests/op/test_timvx_op_interp.cpp
+++ b/tests/op/test_timvx_op_interp.cpp
@@ -1,175 +1,189 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/interp_param.h"
-
-
-int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Interp");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct interp_param* param = ( struct interp_param* )(struct node* )test_node->op.param_mem;
-
-    param->resize_type = 1;
-    param->output_height = 2;
-    param->output_width = 2;
-    param->height_scale = 0.5;
-    param->width_scale = 0.5;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[16] = {1.0f, 1.0f, 1.0f, 1.0f,
-                        1.0f, 2.0f, 2.0f, 1.0f,
-                        1.0f, 2.0f, 2.0f, 1.0f,
-                        1.0f, 1.0f, 1.0f, 1.0f, };
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[4] = {1, 1, 1, 2};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 4, w = 4;
-    const char* test_node_name = "interp";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "interp");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[16] = {0};
-    get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 16);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/interp_param.h"
+
+int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{ /* graph-builder callback handed to create_timvx_test_graph() in main: appends one "Interp" node */
+    (void)layout; /* layout and the n/c/h/w dims are accepted but not used here */
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the node under test with op type "Interp" */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Interp");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1; /* the lookup of input_name returned NULL */
+    }
+
+    /* the looked-up tensor becomes input 0 of the test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* the output tensor is created under the node's own name and attached as output 0 */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params; NOTE(review): test_node is dereferenced here without a NULL check */
+    struct interp_param* param = (struct interp_param*)(struct node*)test_node->op.param_mem;
+
+    param->resize_type = 1; /* NOTE(review): presumably nearest-neighbour; confirm against interp_param.h */
+    param->output_height = 2; /* 2 x 2 output, i.e. four values like reference_out */
+    param->output_width = 2;
+    param->height_scale = 0.5; /* 0.5 matches the 4 x 4 input set up in main shrinking to 2 x 2 */
+    param->width_scale = 0.5;
+
+    return 0; /* 0 on success, -1 when the input tensor lookup fails */
+}
+
+/* uint8 quantization formulas the test data below relies on:
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[16] = { /* 16 floats for the h = 4, w = 4 input main declares; row starts are marked */
+    1.0f, /* row 0 */
+    1.0f,
+    1.0f,
+    1.0f,
+    1.0f, /* row 1 */
+    2.0f,
+    2.0f,
+    1.0f,
+    1.0f, /* row 2 */
+    2.0f,
+    2.0f,
+    1.0f,
+    1.0f, /* row 3 */
+    1.0f,
+    1.0f,
+    1.0f,
+};
+float input_scale = 1; /* with scale 1 and zero point 0 the whole-number inputs quantize exactly */
+int input_zero_point = 0;
+
+float reference_out[4] = {1, 1, 1, 2}; /* expected dequantized values of the 2 x 2 output */
+float output_scale = 1; /* initial output quant params; main re-reads them from the tensor after the run */
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{ /* quantizes `size` floats from data_fp32 into date_u8, one uint8 per element */
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point); // rounded once, after the zero point is added
+        if (udata > 255) // saturate to the uint8 range [0, 255] before the narrowing store
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    // uint8 Interp test: quantize input_fp32, run the TIM-VX test graph, compare with reference_out
+    int n = 1, c = 1, h = 4, w = 4;
+    const char* test_node_name = "interp";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init; give up on failure instead of building a graph on an uninitialized runtime
+    int ret = test_graph_init();
+    if (0 != ret)
+    {
+        fprintf(stderr, "Tengine init failed.\n");
+        return -1;
+    }
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params; both tensors are dereferenced later, so reject a failed lookup
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "interp");
+    if (NULL == input_tensor || NULL == output_tensor)
+    {
+        fprintf(stderr, "Get input/output tensor failed.\n");
+        test_graph_release(ir_graph);
+        return -1;
+    }
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input_u8[16] = {0};
+    get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 16);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and re-read its quantization parameters from the tensor
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // check the result; each element is dequantized on the fly, so no temporary buffer is needed
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        float value = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+        if (fabsf(value - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, value, reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    test_graph_release(ir_graph);
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_leakyrelu.cpp b/tests/op/test_timvx_op_leakyrelu.cpp
index 3f69a25ac..6a037ef5a 100644
--- a/tests/op/test_timvx_op_leakyrelu.cpp
+++ b/tests/op/test_timvx_op_leakyrelu.cpp
@@ -22,21 +22,23 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 #include "operator/prototype/relu_param.h"
 
-
 int create_test_leakyrelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "ReLU");
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "ReLU");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -50,7 +52,7 @@ int create_test_leakyrelu_node(graph_t graph, const char* input_name, const char
     set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
 
     /* set params */
-    struct relu_param* relu_param = ( struct relu_param* )(struct node* )test_node->op.param_mem;
+    struct relu_param* relu_param = (struct relu_param*)(struct node*)test_node->op.param_mem;
     relu_param->negative_slope = 0.1f;
 
     return 0;
@@ -83,7 +85,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_leakyrelu_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -106,7 +108,7 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
@@ -114,13 +116,13 @@ int main(int argc, char* argv[])
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     std::vector<float> output_data(output_size);
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data.data() + i * cstep;
+        float* output_value = (float*)output_data.data() + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.05f)
diff --git a/tests/op/test_timvx_op_mish.cpp b/tests/op/test_timvx_op_mish.cpp
index bcbe67629..31aa6966a 100644
--- a/tests/op/test_timvx_op_mish.cpp
+++ b/tests/op/test_timvx_op_mish.cpp
@@ -25,17 +25,20 @@
 #include <vector>
 #include "test_op.h"
 
-
 int create_test_mish_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "Mish");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +81,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_mish_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,22 +104,22 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     std::vector<float> output_data(output_size);
-    
+
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data.data() + i * cstep;
+        float* output_value = (float*)output_data.data() + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01)
diff --git a/tests/op/test_timvx_op_permute.cpp b/tests/op/test_timvx_op_permute.cpp
index 8232f6cab..6848c1588 100644
--- a/tests/op/test_timvx_op_permute.cpp
+++ b/tests/op/test_timvx_op_permute.cpp
@@ -1,175 +1,186 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/permute_param.h"
-
-
-int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Permute");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct permute_param* param = ( struct permute_param* )(struct node* )test_node->op.param_mem;
-
-    param->flag = 0;
-    param->order0 = 0;
-    param->order1 = 2;
-    param->order2 = 3;
-    param->order3 = 1;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[6] = {1.0f, 2.0f, 3.0f,
-                        4.0f, 5.0f, 6.0f, };
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[6] = {1.0f, 4.0f,
-                            2.0f, 5.0f,
-                            3.0f, 6.0f, };
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 2, h = 1, w = 3;
-    const char* test_node_name = "permute";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[6] = {0};
-    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 6);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/permute_param.h"
+
+int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Permute");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct permute_param* param = (struct permute_param*)(struct node*)test_node->op.param_mem;
+
+    param->flag = 0;
+    param->order0 = 0;
+    param->order1 = 2;
+    param->order2 = 3;
+    param->order3 = 1;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[6] = {
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+};
+float input_scale = 1;
+int input_zero_point = 0;
+
+float reference_out[6] = {
+    1.0f,
+    4.0f,
+    2.0f,
+    5.0f,
+    3.0f,
+    6.0f,
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point);
+        if (udata > 255)
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 2, h = 1, w = 3;
+    const char* test_node_name = "permute";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute");
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input_u8[6] = {0};
+    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 6);
+
+    // set bias data
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_pooling.cpp b/tests/op/test_timvx_op_pooling.cpp
index 61a02c7ac..65575616f 100644
--- a/tests/op/test_timvx_op_pooling.cpp
+++ b/tests/op/test_timvx_op_pooling.cpp
@@ -22,21 +22,23 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 #include "operator/prototype/pooling_param.h"
 
-
 int create_test_pool_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Pooling");
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Pooling");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -50,7 +52,7 @@ int create_test_pool_node(graph_t graph, const char* input_name, const char* nod
     set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
 
     /* set params */
-    struct pool_param* pool_param = ( struct pool_param* )(struct node* )test_node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)(struct node*)test_node->op.param_mem;
 
     pool_param->pool_method = POOL_MAX;
     pool_param->global = 0;
@@ -99,7 +101,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_pool_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -122,21 +124,21 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data + i * cstep;
+        float* output_value = (float*)output_data + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01)
diff --git a/tests/op/test_timvx_op_prelu.cpp b/tests/op/test_timvx_op_prelu.cpp
index fe3fc0a63..57b7e9bcd 100644
--- a/tests/op/test_timvx_op_prelu.cpp
+++ b/tests/op/test_timvx_op_prelu.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_prelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "PReLU");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -48,7 +50,7 @@ int create_test_prelu_node(graph_t graph, const char* input_name, const char* no
 
     int dims[4];
     get_tensor_shape(input_tensor, dims, 4);
-    int slope_dims[1] = {dims[1]};  // channel num
+    int slope_dims[1] = {dims[1]}; // channel num
     set_tensor_shape(slope_tensor, slope_dims, 1);
 
     /* input tensors of test node */
@@ -90,7 +92,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_prelu_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -120,21 +122,21 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data + i * cstep;
+        float* output_value = (float*)output_data + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - result_value[i]) > 0.01)
diff --git a/tests/op/test_timvx_op_relu.cpp b/tests/op/test_timvx_op_relu.cpp
index 1ed17f270..c7143ca78 100644
--- a/tests/op/test_timvx_op_relu.cpp
+++ b/tests/op/test_timvx_op_relu.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_relu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "ReLU");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +80,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,21 +103,21 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data + i * cstep;
+        float* output_value = (float*)output_data + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01)
diff --git a/tests/op/test_timvx_op_relu1.cpp b/tests/op/test_timvx_op_relu1.cpp
index 8e3134a8d..78e60376d 100644
--- a/tests/op/test_timvx_op_relu1.cpp
+++ b/tests/op/test_timvx_op_relu1.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_relu1_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "ReLU1");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +80,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu1_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,21 +103,21 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
+    float* output_data = (float*)malloc(output_size * sizeof(float));
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data + i * cstep;
+        float* output_value = (float*)output_data + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01)
diff --git a/tests/op/test_timvx_op_reshape.cpp b/tests/op/test_timvx_op_reshape.cpp
index b9190d086..a511fe7ff 100644
--- a/tests/op/test_timvx_op_reshape.cpp
+++ b/tests/op/test_timvx_op_reshape.cpp
@@ -1,180 +1,192 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/reshape_param.h"
-
-
-int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Reshape");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct reshape_param* param = ( struct reshape_param* )(struct node* )test_node->op.param_mem;
-
-    param->dim_size = 4;
-
-    int* shape_tmp = ( int* )malloc(param->dim_size * sizeof(int));
-    shape_tmp[0] = 1;
-    shape_tmp[1] = 1;
-    shape_tmp[2] = 3;
-    shape_tmp[3] = 2;
-
-    param->re_shape = shape_tmp;
-    param->is_onnx = 1;
-    param->reverse = 0;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[6] = {1.0f, 2.0f, 3.0f,
-                       4.0f, 5.0f, 6.0f, };
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[6] = {1.0f, 2.0f, 3.0f,
-                          4.0f, 5.0f, 6.0f, };
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 2, h = 1, w = 3;
-    const char* test_node_name = "reshape";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "reshape");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[6] = {0};
-    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 6);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/reshape_param.h"
+
+int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Reshape");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct reshape_param* param = (struct reshape_param*)(struct node*)test_node->op.param_mem;
+
+    param->dim_size = 4;
+
+    int* shape_tmp = (int*)malloc(param->dim_size * sizeof(int));
+    shape_tmp[0] = 1;
+    shape_tmp[1] = 1;
+    shape_tmp[2] = 3;
+    shape_tmp[3] = 2;
+
+    param->re_shape = shape_tmp;
+    param->is_onnx = 1;
+    param->reverse = 0;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[6] = {
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+};
+float input_scale = 1;
+int input_zero_point = 0;
+
+float reference_out[6] = {
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+};
+float output_scale = 1;
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point);
+        if (udata > 255)
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 2, h = 1, w = 3;
+    const char* test_node_name = "reshape";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "reshape");
+
+    //    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input_u8[6] = {0};
+    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 6);
+
+    // set bias data
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_resize.cpp b/tests/op/test_timvx_op_resize.cpp
index 2c54ee129..1c64c9b8e 100644
--- a/tests/op/test_timvx_op_resize.cpp
+++ b/tests/op/test_timvx_op_resize.cpp
@@ -1,173 +1,187 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/resize_param.h"
-
-
-int create_test_resize_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Resize");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct resize_param* param = ( struct resize_param* )(struct node* )test_node->op.param_mem;
-
-    param->type = 0;
-    param->scale_w = 0.5;
-    param->scale_h = 0.5;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[16] = {1.0f, 1.0f, 1.0f, 1.0f,
-                        1.0f, 2.0f, 2.0f, 1.0f,
-                        1.0f, 2.0f, 2.0f, 1.0f,
-                        1.0f, 1.0f, 1.0f, 1.0f, };
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[4] = {1, 1, 1, 2};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 4, w = 4;
-    const char* test_node_name = "resize";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_resize_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "resize");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[16] = {0};
-    get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 16);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/resize_param.h"
+
+int create_test_resize_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{ /* Adds a "Resize" node called node_name to graph, wired to the existing tensor input_name. Returns 0 on success, -1 if that tensor cannot be found. */
+    (void)layout; /* layout, n, c, h and w are not used by this builder; the void casts only mark them as intentionally unused */
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node; NOTE(review): the result is used below without a NULL check */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Resize");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name); /* look the input tensor up by name */
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node: the looked-up tensor becomes input 0 */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node: a new tensor created with the node's name and data_type becomes output 0 */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params: the node's parameter memory is accessed as a resize_param; NOTE(review): the intermediate (struct node*) cast looks redundant */
+    struct resize_param* param = (struct resize_param*)(struct node*)test_node->op.param_mem;
+
+    param->type = 0; /* resize mode selector; presumably 0 = nearest-neighbour, TODO confirm against resize_param.h */
+    param->scale_w = 0.5; /* scale factors of 0.5 in both directions; with the 4x4 input used in main() this presumably yields 2x2 (reference_out has 4 values) */
+    param->scale_h = 0.5;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[16] = { /* 16 float test inputs; main() quantizes them to uint8 and uses them as the data of its n=1, c=1, h=4, w=4 input tensor */
+    1.0f,
+    1.0f,
+    1.0f,
+    1.0f,
+    1.0f,
+    2.0f,
+    2.0f,
+    1.0f,
+    1.0f,
+    2.0f,
+    2.0f,
+    1.0f,
+    1.0f,
+    1.0f,
+    1.0f,
+    1.0f,
+};
+float input_scale = 1; /* quantization scale that main() sets on the input tensor */
+int input_zero_point = 0; /* quantization zero point that main() sets on the input tensor */
+
+float reference_out[4] = {1, 1, 1, 2}; /* expected dequantized outputs; main() compares against these with a tolerance of 0.1 */
+float output_scale = 1; /* set on the output tensor before the run, then passed to get_tensor_quant_param after it in main() */
+int output_zero_point = 0; /* handled the same way as output_scale */
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{ // Quantizes `size` floats from data_fp32 into date_u8 as round(x / scale + zero_point), clamped to [0, 255].
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point); // (round)(...) is an ordinary call to round; the result is converted to int by the initialization
+        if (udata > 255) // saturate to the uint8 range instead of letting the store below wrap around
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata; // udata is already within 0..255 here, so the narrowing store keeps the value
+    }
+}
+
+int main(int argc, char* argv[])
+{ // Runs one uint8 "resize" node in a graph built by create_timvx_test_graph and compares the dequantized output with reference_out. Returns 0 on pass, -1 on any failure. argc/argv are unused.
+    int n = 1, c = 1, h = 4, w = 4; // dimensions handed to create_timvx_test_graph; n*c*h*w = 16 matches input_fp32
+    const char* test_node_name = "resize";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init; NOTE(review): a failure is only logged here, execution continues regardless
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create the graph; create_test_resize_node is passed as the callback that builds the node under test
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_resize_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params on the tensors named "input_node" and "resize"; NOTE(review): neither lookup result is NULL-checked
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "resize");
+
+    //    disabled: tensor_t weight_tensor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data: quantize the 16 floats and hand the uint8 buffer to the input tensor
+    uint8_t input_u8[16] = {0};
+    get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 16); // presumably the tensor references this stack buffer rather than copying it; it stays in scope for the whole run either way
+
+    // set bias data (disabled)
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run; on failure the graph is released and the test exits with -1
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant: float = (uint8 - zero_point) * scale
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); // re-read the output quant params after the run; presumably they can differ from the values set above
+    float* output_data = (float*)malloc(output_size * sizeof(float)); // NOTE(review): not checked for NULL and never freed before main returns
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result: every element must be within 0.1 of reference_out; all mismatches are printed, not just the first
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): indexes reference_out[4] up to output_size, so this assumes output_size <= 4
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit: release the graph and report the comparison result as the process exit code
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_sigmoid.cpp b/tests/op/test_timvx_op_sigmoid.cpp
index b680ff2d5..b37411f8f 100644
--- a/tests/op/test_timvx_op_sigmoid.cpp
+++ b/tests/op/test_timvx_op_sigmoid.cpp
@@ -22,20 +22,22 @@
  * Author: qtang@openailab.com
  */
 
-
 #include "test_op.h"
 
-
 int create_test_sigmoid_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Sigmoid");
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Sigmoid");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +80,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_sigmoid_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,7 +103,7 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
@@ -109,13 +111,13 @@ int main(int argc, char* argv[])
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     std::vector<float> output_data(output_size);
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data.data() + i * cstep;
+        float* output_value = (float*)output_data.data() + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.01f)
diff --git a/tests/op/test_timvx_op_slice.cpp b/tests/op/test_timvx_op_slice.cpp
index f9ce16bbf..2f3f86083 100644
--- a/tests/op/test_timvx_op_slice.cpp
+++ b/tests/op/test_timvx_op_slice.cpp
@@ -1,172 +1,182 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/slice_param.h"
-
-
-int create_test_slice_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Slice");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct slice_param* param = ( struct slice_param* )(struct node* )test_node->op.param_mem;
-
-    param->axis = 1;
-    param->begin = 1;
-    param->end = 2;
-    param->isonnx = 1;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[6] = {1.0f, 2.0f, 3.0f,
-                       4.0f, 5.0f, 6.0f, };
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[3] = {4.0f, 5.0f, 6.0f, };
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 2, h = 1, w = 3;
-    const char* test_node_name = "slice";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_slice_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "slice");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[6] = {0};
-    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 6);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/slice_param.h"
+
+int create_test_slice_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{ /* Adds a "Slice" node called node_name to graph, wired to the existing tensor input_name. Returns 0 on success, -1 if that tensor cannot be found. */
+    (void)layout; /* layout, n, c, h and w are not used by this builder; the void casts only mark them as intentionally unused */
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node; NOTE(review): the result is used below without a NULL check */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Slice");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name); /* look the input tensor up by name */
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node: the looked-up tensor becomes input 0 */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node: a new tensor created with the node's name and data_type becomes output 0 */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params: the node's parameter memory is accessed as a slice_param; NOTE(review): the intermediate (struct node*) cast looks redundant */
+    struct slice_param* param = (struct slice_param*)(struct node*)test_node->op.param_mem;
+
+    param->axis = 1; /* presumably the channel axis of the NCHW input created in main() (c = 2) - TODO confirm */
+    param->begin = 1; /* begin = 1, end = 2 along that axis; reference_out (4, 5, 6) is the second group of three inputs, consistent with selecting index 1 */
+    param->end = 2;
+    param->isonnx = 1; /* presumably selects ONNX-style slice semantics - TODO confirm against slice_param.h */
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[6] = { /* 6 float test inputs; main() quantizes them to uint8 and uses them as the data of its n=1, c=2, h=1, w=3 input tensor */
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+};
+float input_scale = 1; /* quantization scale that main() sets on the input tensor */
+int input_zero_point = 0; /* quantization zero point that main() sets on the input tensor */
+
+float reference_out[3] = { /* expected dequantized outputs (the last three inputs); main() compares against these with a tolerance of 0.1 */
+    4.0f,
+    5.0f,
+    6.0f,
+};
+float output_scale = 1; /* set on the output tensor before the run, then passed to get_tensor_quant_param after it in main() */
+int output_zero_point = 0; /* handled the same way as output_scale */
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{ // Quantizes `size` floats from data_fp32 into date_u8 as round(x / scale + zero_point), clamped to [0, 255].
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point); // (round)(...) is an ordinary call to round; the result is converted to int by the initialization
+        if (udata > 255) // saturate to the uint8 range instead of letting the store below wrap around
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata; // udata is already within 0..255 here, so the narrowing store keeps the value
+    }
+}
+
+int main(int argc, char* argv[])
+{ // Runs one uint8 "slice" node in a graph built by create_timvx_test_graph and compares the dequantized output with reference_out. Returns 0 on pass, -1 on any failure. argc/argv are unused.
+    int n = 1, c = 2, h = 1, w = 3; // dimensions handed to create_timvx_test_graph; n*c*h*w = 6 matches input_fp32
+    const char* test_node_name = "slice";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init; NOTE(review): a failure is only logged here, execution continues regardless
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create the graph; create_test_slice_node is passed as the callback that builds the node under test
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_slice_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params on the tensors named "input_node" and "slice"; NOTE(review): neither lookup result is NULL-checked
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "slice");
+
+    //    disabled: tensor_t weight_tensor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data: quantize the 6 floats and hand the uint8 buffer to the input tensor
+    uint8_t input_u8[6] = {0};
+    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 6); // presumably the tensor references this stack buffer rather than copying it; it stays in scope for the whole run either way
+
+    // set bias data (disabled)
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run; on failure the graph is released and the test exits with -1
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant: float = (uint8 - zero_point) * scale
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); // re-read the output quant params after the run; presumably they can differ from the values set above
+    float* output_data = (float*)malloc(output_size * sizeof(float)); // NOTE(review): not checked for NULL and never freed before main returns
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result: every element must be within 0.1 of reference_out; all mismatches are printed, not just the first
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): indexes reference_out[3] up to output_size, so this assumes output_size <= 3
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit: release the graph and report the comparison result as the process exit code
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_softmax.cpp b/tests/op/test_timvx_op_softmax.cpp
index 70d535d90..4d1a577fa 100644
--- a/tests/op/test_timvx_op_softmax.cpp
+++ b/tests/op/test_timvx_op_softmax.cpp
@@ -1,168 +1,176 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/softmax_param.h"
-
-
-int create_test_softmax_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Softmax");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct softmax_param* param = ( struct softmax_param* )(struct node* )test_node->op.param_mem;
-
-    param->axis = 1;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[3] = {0.0f, 1.0f, 2.0f,};
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[3] = {0.0f, 0.243164, 0.666740,};
-float output_scale = 0.003922;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 3, h = 1, w = 1;
-    const char* test_node_name = "softmax";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_softmax_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "softmax");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[3] = {0};
-    get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 3);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/softmax_param.h"
+
+int create_test_softmax_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{ /* Adds a "Softmax" node called node_name to graph, wired to the existing tensor input_name. Returns 0 on success, -1 if that tensor cannot be found. */
+    (void)layout; /* layout, n, c, h and w are not used by this builder; the void casts only mark them as intentionally unused */
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node; NOTE(review): the result is used below without a NULL check */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Softmax");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name); /* look the input tensor up by name */
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node: the looked-up tensor becomes input 0 */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node: a new tensor created with the node's name and data_type becomes output 0 */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params: the node's parameter memory is accessed as a softmax_param; NOTE(review): the intermediate (struct node*) cast looks redundant */
+    struct softmax_param* param = (struct softmax_param*)(struct node*)test_node->op.param_mem;
+
+    param->axis = 1; /* presumably the channel axis; main() builds a 1x3x1x1 NCHW input, where axis 1 is the only one with more than one element */
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[3] = { /* 3 float test inputs; main() quantizes them to uint8 and uses them as the data of its n=1, c=3, h=1, w=1 input tensor */
+    0.0f,
+    1.0f,
+    2.0f,
+};
+float input_scale = 1; /* quantization scale that main() sets on the input tensor */
+int input_zero_point = 0; /* quantization zero point that main() sets on the input tensor */
+
+float reference_out[3] = { /* expected dequantized outputs; main() compares against these with a tolerance of 0.1 */
+    0.0f, /* NOTE(review): softmax({0, 1, 2})[0] is about 0.090, not 0.0; this entry only passes through the 0.1 tolerance - confirm the intended value */
+    0.243164, /* 62 * output_scale */
+    0.666740, /* 170 * output_scale */
+};
+float output_scale = 0.003922; /* roughly 1/255; set on the output tensor before the run, then passed to get_tensor_quant_param after it in main() */
+int output_zero_point = 0; /* handled the same way as output_scale */
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{ // Quantizes `size` floats from data_fp32 into date_u8 as round(x / scale + zero_point), clamped to [0, 255].
+    for (int i = 0; i < size; i++)
+    {
+        int udata = (round)(data_fp32[i] / scale + zero_point); // (round)(...) is an ordinary call to round; the result is converted to int by the initialization
+        if (udata > 255) // saturate to the uint8 range instead of letting the store below wrap around
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+
+        date_u8[i] = udata; // udata is already within 0..255 here, so the narrowing store keeps the value
+    }
+}
+
+int main(int argc, char* argv[])
+{ // Runs one uint8 "softmax" node in a graph built by create_timvx_test_graph and compares the dequantized output with reference_out. Returns 0 on pass, -1 on any failure. argc/argv are unused.
+    int n = 1, c = 3, h = 1, w = 1; // dimensions handed to create_timvx_test_graph; n*c*h*w = 3 matches input_fp32
+    const char* test_node_name = "softmax";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init; NOTE(review): a failure is only logged here, execution continues regardless
+    int ret = test_graph_init();
+    if (0 != ret)
+        fprintf(stderr, "Tengine init failed.\n");
+
+    // create the graph; create_test_softmax_node is passed as the callback that builds the node under test
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_softmax_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params on the tensors named "input_node" and "softmax"; NOTE(review): neither lookup result is NULL-checked
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "softmax");
+
+    //    disabled: tensor_t weight_tensor = get_graph_input_tensor(ir_graph, 1, 0);
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data: quantize the 3 floats and hand the uint8 buffer to the input tensor
+    uint8_t input_u8[3] = {0};
+    get_uint8_data(input_fp32, input_u8, 3, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 3); // presumably the tensor references this stack buffer rather than copying it; it stays in scope for the whole run either way
+
+    // set bias data (disabled)
+    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
+
+    // graph run; on failure the graph is released and the test exits with -1
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant: float = (uint8 - zero_point) * scale
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1); // re-read the output quant params after the run; presumably they can differ from the values set above
+    float* output_data = (float*)malloc(output_size * sizeof(float)); // NOTE(review): not checked for NULL and never freed before main returns
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result: every element must be within 0.1 of reference_out (about 25 quantization steps at the initial output_scale); all mismatches are printed
+    ret = 0;
+    for (int i = 0; i < output_size; i++) // NOTE(review): indexes reference_out[3] up to output_size, so this assumes output_size <= 3
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit: release the graph and report the comparison result as the process exit code
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_split.cpp b/tests/op/test_timvx_op_split.cpp
index 7652cc3c4..419082e6f 100644
--- a/tests/op/test_timvx_op_split.cpp
+++ b/tests/op/test_timvx_op_split.cpp
@@ -1,205 +1,219 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/split_param.h"
-
-extern "C" {
-#include "vector.h"
-}
-
-int create_test_split_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Split");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    tensor_t output_tensor1 = create_graph_tensor(graph, "out1", data_type);
-    set_node_output_tensor(test_node, 1, output_tensor1, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct split_param* param = ( struct split_param* )(struct node* )test_node->op.param_mem;
-
-    param->axis = 1;
-    param->split_dim = 2;
-
-    param->split_sizes_ = create_vector(sizeof(int), nullptr);
-
-    int tmp = 1;
-    push_vector_data(param->split_sizes_, &tmp);
-    push_vector_data(param->split_sizes_, &tmp);
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[6] = {1.0f, 2.0f, 3.0f,
-                       4.0f, 5.0f, 6.0f, };
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[3] = {1.0f, 2.0f, 3.0f, };
-float output_scale = 1;
-int output_zero_point = 0;
-
-float reference_out1[3] = {4.0f, 5.0f, 6.0f, };
-float output_scale1 = 1;
-int output_zero_point1 = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 2, h = 1, w = 3;
-    const char* test_node_name = "split";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_split_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "split");
-    struct tensor* output_tensor1 = (struct tensor*)get_graph_tensor(ir_graph, "out1");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    set_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1);
-
-    // set input data
-    uint8_t input_u8[6] = {0};
-    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 6);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-
-    uint8_t* output1_u8 = ( uint8_t* )output_tensor1->data;
-    int output_size1 = output_tensor1->elem_num;
-
-    get_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1);
-    float* output_data1 = ( float* )malloc(output_size1 * sizeof(float));
-    for (int i = 0; i < output_size1; i++)
-        output_data1[i] = (( float )output1_u8[i] - ( float )output_zero_point1) * output_scale1;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-    for (int i = 0; i< output_size1; i++)
-    {
-        if (fabsf(output_data1[i] - reference_out1[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data1[i], reference_out1[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/split_param.h"
+
+extern "C" {
+#include "vector.h"
+}
+
+int create_test_split_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout; // layout and n/c/h/w are never read here; the casts only mark them as intentionally unused
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create a "Split" node called node_name in graph; NOTE(review): the result is used below without a NULL check */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Split");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name); // the input tensor is looked up by name, not created here
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1; // the only failure this function reports to its caller
+    }
+
+    /* input 0 of the test node is the tensor found above */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* two outputs: output 0 is named after the node, output 1 is named "out1"; both are created with data_type */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    tensor_t output_tensor1 = create_graph_tensor(graph, "out1", data_type);
+    set_node_output_tensor(test_node, 1, output_tensor1, TENSOR_TYPE_VAR);
+
+    /* set params: op.param_mem of the node is accessed as a split_param */
+    struct split_param* param = (struct split_param*)(struct node*)test_node->op.param_mem;
+
+    param->axis = 1;
+    param->split_dim = 2; // presumably the number of pieces, matching the two outputs above - TODO confirm
+
+    param->split_sizes_ = create_vector(sizeof(int), nullptr); // vector of int elements; NOTE(review): meaning of the nullptr argument is not visible here
+
+    int tmp = 1; // both entries pushed below carry the value 1
+    push_vector_data(param->split_sizes_, &tmp);
+    push_vector_data(param->split_sizes_, &tmp);
+
+    return 0; // node, tensors and params have been set up
+}
+
+/* Test data and per-tensor quantization parameters; the uint8 <-> float32 mapping used by this test is:
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[6] = { // float input values; main() quantizes them with get_uint8_data before the run
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+};
+float input_scale = 1; // with scale 1 and zero point 0 the uint8 codes equal the float values
+int input_zero_point = 0;
+
+float reference_out[3] = { // expected dequantized values of the tensor named "split" (first three inputs)
+    1.0f,
+    2.0f,
+    3.0f,
+};
+float output_scale = 1; // set on the "split" tensor in main(), then read back before dequantizing
+int output_zero_point = 0;
+
+float reference_out1[3] = { // expected dequantized values of the tensor named "out1" (last three inputs)
+    4.0f,
+    5.0f,
+    6.0f,
+};
+float output_scale1 = 1; // same role as output_scale, for the "out1" tensor
+int output_zero_point1 = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    // Quantize size floats: date_u8[k] = clip(round(data_fp32[k] / scale + zero_point), 0, 255).
+    for (int k = 0; k < size; ++k)
+    {
+        // The zero point is added before rounding, so the sum is rounded as a whole.
+        const int quantized = (round)(data_fp32[k] / scale + zero_point);
+
+        // Saturate to the uint8 range so the narrowing store below cannot wrap around.
+        const int clipped = quantized < 0 ? 0 : (quantized > 255 ? 255 : quantized);
+        date_u8[k] = clipped;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 2, h = 1, w = 3;
+    const char* test_node_name = "split";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init: nothing below can work without the runtime, so stop here on failure
+    int ret = test_graph_init();
+    if (0 != ret)
+    {
+        fprintf(stderr, "Tengine init failed.\n");
+        return -1;
+    }
+
+    // create the test graph; create_test_split_node adds the node under test
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_split_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params on the input and on both split outputs (tensors are looked up by name)
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "split");
+    struct tensor* output_tensor1 = (struct tensor*)get_graph_tensor(ir_graph, "out1");
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    set_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1);
+
+    // set input data: quantize input_fp32 and hand the uint8 buffer to the input tensor
+    uint8_t input_u8[6] = {0};
+    get_uint8_data(input_fp32, input_u8, 6, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 6);
+
+    // graph run (nothing has been malloc'ed yet, so only the graph needs releasing on failure)
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant: float32 = (uint8 - zero_point) * scale, using the params read back from the tensor
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    uint8_t* output1_u8 = (uint8_t*)output_tensor1->data;
+    int output_size1 = output_tensor1->elem_num;
+
+    get_tensor_quant_param(output_tensor1, &output_scale1, &output_zero_point1, 1);
+    float* output_data1 = (float*)malloc(output_size1 * sizeof(float));
+    for (int i = 0; i < output_size1; i++)
+        output_data1[i] = ((float)output1_u8[i] - (float)output_zero_point1) * output_scale1;
+
+    // check the result: any element further than 0.1 from its reference fails the test (ret = -1)
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+    for (int i = 0; i < output_size1; i++)
+    {
+        if (fabsf(output_data1[i] - reference_out1[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data1[i], reference_out1[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit: free the dequantized copies malloc'ed above (previously leaked), then release the graph
+    free(output_data);
+    free(output_data1);
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_tanh.cpp b/tests/op/test_timvx_op_tanh.cpp
index fdf93bd89..4f8310940 100644
--- a/tests/op/test_timvx_op_tanh.cpp
+++ b/tests/op/test_timvx_op_tanh.cpp
@@ -25,17 +25,20 @@
 #include <vector>
 #include "test_op.h"
 
-
 int create_test_tanh_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
 {
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
 
     /* create the test node */
     node_t test_node = create_graph_node(graph, node_name, "Tanh");
 
     tensor_t input_tensor = get_graph_tensor(graph, input_name);
 
-    if(NULL == input_tensor)
+    if (NULL == input_tensor)
     {
         fprintf(stderr, "create test node failed.\n");
         return -1;
@@ -78,7 +81,7 @@ int main(int argc, char* argv[])
 
     // create
     graph_t graph = create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_tanh_node);
-    if(NULL == graph)
+    if (NULL == graph)
         return -1;
 
     // set quantize params
@@ -101,22 +104,22 @@ int main(int argc, char* argv[])
 
     // get output and dequant
     struct tensor* output_tensor = (struct tensor*)get_graph_output_tensor(graph, 0, 0);
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
     int output_size = output_tensor->elem_num;
     int out_c = output_tensor->dims[1];
     int cstep = output_tensor->dims[2] * output_tensor->dims[3];
 
     get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
     std::vector<float> output_data(output_size);
-    
+
     for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
 
     // check the result
     ret = 0;
-    for (int i = 0; i< out_c; i++)
+    for (int i = 0; i < out_c; i++)
     {
-        float* output_value =  (float *)output_data.data() + i * cstep;
+        float* output_value = (float*)output_data.data() + i * cstep;
         for (int j = 0; j < cstep; j++)
         {
             if (fabsf(output_value[j] - reference_out[i]) > 0.05f)
diff --git a/tests/op/test_timvx_op_transpose.cpp b/tests/op/test_timvx_op_transpose.cpp
index 1f5b4e424..9233d5009 100644
--- a/tests/op/test_timvx_op_transpose.cpp
+++ b/tests/op/test_timvx_op_transpose.cpp
@@ -1,183 +1,223 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/transpose_param.h"
-
-
-int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Transpose");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct transpose_param* param = ( struct transpose_param* )(struct node* )test_node->op.param_mem;
-
-    int* t_shape = (int*) malloc(sizeof(int) * 5 ) ;
-    t_shape[0] = 0;
-    t_shape[1] = 2;
-    t_shape[2] = 1;
-    t_shape[3] = 3;
-    t_shape[4] = 4;
-
-    param->tr_shape_size = 5;
-    param->tr_shape = t_shape;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[24] = {1.0f, 2.0f, 3.0f, 4.0f,
-                        5.0f, 6.0f, 7.0f, 8.0f,
-                        9.0f, 10.0f, 11.0f, 12.0f,
-                        13.0f, 14.0f, 15.0f, 16.0f,
-                        17.0f, 18.0f, 19.0f, 20.0f,
-                        21.0f, 22.0f, 23.0f, 24.0f,};
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[24] = {1.0f, 2.0f, 3.0f, 4.0f,
-                           13.0f, 14.0f, 15.0f, 16.0f,
-                           5.0f, 6.0f, 7.0f, 8.0f,
-                           17.0f, 18.0f, 19.0f, 20.0f,
-                           9.0f, 10.0f, 11.0f, 12.0f,
-                           21.0f, 22.0f, 23.0f, 24.0f,};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 2, c = 3, h = 2, w = 2;
-    const char* test_node_name = "permute";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node, 5);
-    if(NULL == ir_graph)
-        return -1;
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[24] = {0};
-    get_uint8_data(input_fp32, input_u8, 24, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 24);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/transpose_param.h"
+
+int create_test_permute_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout; // layout and n/c/h/w are never read here; the casts only mark them as intentionally unused
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create a "Transpose" node called node_name in graph; NOTE(review): the result is used below without a NULL check */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Transpose");
+
+    tensor_t input_tensor = get_graph_tensor(graph, input_name); // the input tensor is looked up by name, not created here
+
+    if (NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1; // the only failure this function reports to its caller
+    }
+
+    /* input 0 of the test node is the tensor found above */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* single output, named after the node and created with data_type */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params: op.param_mem of the node is accessed as a transpose_param */
+    struct transpose_param* param = (struct transpose_param*)(struct node*)test_node->op.param_mem;
+
+    int* t_shape = (int*)malloc(sizeof(int) * 5); // NOTE(review): malloc result is written to without a NULL check
+    t_shape[0] = 0; // five entries 0,2,1,3,4: only positions 1 and 2 differ from the identity order
+    t_shape[1] = 2;
+    t_shape[2] = 1;
+    t_shape[3] = 3;
+    t_shape[4] = 4;
+
+    param->tr_shape_size = 5; // number of entries in t_shape
+    param->tr_shape = t_shape; // not freed in this function; presumably released together with the node - TODO confirm
+
+    return 0; // node, tensors and params have been set up
+}
+
+/* Test data and per-tensor quantization parameters; the uint8 <-> float32 mapping used by this test is:
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[24] = { // float input values 1..24; main() quantizes them with get_uint8_data before the run
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    5.0f,
+    6.0f,
+    7.0f,
+    8.0f,
+    9.0f,
+    10.0f,
+    11.0f,
+    12.0f,
+    13.0f,
+    14.0f,
+    15.0f,
+    16.0f,
+    17.0f,
+    18.0f,
+    19.0f,
+    20.0f,
+    21.0f,
+    22.0f,
+    23.0f,
+    24.0f,
+};
+float input_scale = 1; // with scale 1 and zero point 0 the uint8 codes equal the float values
+int input_zero_point = 0;
+
+float reference_out[24] = { // expected dequantized output: the input's groups of four in the order 1-4, 13-16, 5-8, 17-20, 9-12, 21-24
+    1.0f,
+    2.0f,
+    3.0f,
+    4.0f,
+    13.0f,
+    14.0f,
+    15.0f,
+    16.0f,
+    5.0f,
+    6.0f,
+    7.0f,
+    8.0f,
+    17.0f,
+    18.0f,
+    19.0f,
+    20.0f,
+    9.0f,
+    10.0f,
+    11.0f,
+    12.0f,
+    21.0f,
+    22.0f,
+    23.0f,
+    24.0f,
+};
+float output_scale = 1; // set on the "permute" tensor in main(), then read back before dequantizing
+int output_zero_point = 0;
+
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    // Quantize size floats: date_u8[k] = clip(round(data_fp32[k] / scale + zero_point), 0, 255).
+    for (int k = 0; k < size; ++k)
+    {
+        // The zero point is added before rounding, so the sum is rounded as a whole.
+        const int quantized = (round)(data_fp32[k] / scale + zero_point);
+
+        // Saturate to the uint8 range so the narrowing store below cannot wrap around.
+        const int clipped = quantized < 0 ? 0 : (quantized > 255 ? 255 : quantized);
+        date_u8[k] = clipped;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    int n = 2, c = 3, h = 2, w = 2;
+    const char* test_node_name = "permute";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init: nothing below can work without the runtime, so stop here on failure
+    int ret = test_graph_init();
+    if (0 != ret)
+    {
+        fprintf(stderr, "Tengine init failed.\n");
+        return -1;
+    }
+
+    // create the test graph; create_test_permute_node adds the node under test
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_permute_node, 5);
+    if (NULL == ir_graph)
+        return -1;
+
+    // set quantize params on the input and output tensors (both are looked up by name)
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "permute");
+
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data: quantize input_fp32 and hand the uint8 buffer to the input tensor
+    uint8_t input_u8[24] = {0};
+    get_uint8_data(input_fp32, input_u8, 24, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 24);
+
+    // graph run (nothing has been malloc'ed yet, so only the graph needs releasing on failure)
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant: float32 = (uint8 - zero_point) * scale, using the params read back from the tensor
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result: any element further than 0.1 from its reference fails the test (ret = -1)
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit: free the dequantized copy malloc'ed above (previously leaked), then release the graph
+    free(output_data);
+    test_graph_release(ir_graph);
+
+    return ret;
+}
diff --git a/tests/op/test_timvx_op_upsampling.cpp b/tests/op/test_timvx_op_upsampling.cpp
index aa30baa5d..3f8e45a88 100644
--- a/tests/op/test_timvx_op_upsampling.cpp
+++ b/tests/op/test_timvx_op_upsampling.cpp
@@ -1,171 +1,185 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-
-#include "test_op.h"
-
-#include "graph/graph.h"
-#include "graph/node.h"
-#include "graph/tensor.h"
-#include "operator/prototype/upsample_param.h"
-
-
-int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
-{
-    (void)layout; (void)n; (void)c; (void)h; (void)w;
-
-    /* create the test node */
-    struct node* test_node = (struct node* )create_graph_node(graph, node_name, "Upsample");
-
-    tensor_t input_tensor = get_graph_tensor(graph, input_name);
-
-    if(NULL == input_tensor)
-    {
-        fprintf(stderr, "create test node failed.\n");
-        return -1;
-    }
-
-    /* input tensors of test node */
-    set_node_input_tensor(test_node, 0, input_tensor);
-
-    /* output tensors of test node */
-    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
-    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
-
-    /* set params */
-    struct upsample_param* param = ( struct upsample_param* )(struct node* )test_node->op.param_mem;
-
-    param->scale = 0.5;
-
-    return 0;
-}
-
-/*
- * scale = (max - min) / 255
- * zero_point = -min / scale
- * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
- * float32 = (uint8 - zero_point) * scale
- */
-float input_fp32[16] = {1.0f, 1.0f, 1.0f, 1.0f,
-                        1.0f, 2.0f, 2.0f, 1.0f,
-                        1.0f, 2.0f, 2.0f, 1.0f,
-                        1.0f, 1.0f, 1.0f, 1.0f, };
-float input_scale = 1;
-int input_zero_point = 0;
-
-float reference_out[4] = {1, 1, 1, 2};
-float output_scale = 1;
-int output_zero_point = 0;
-
-
-void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
-{
-    for (int i = 0; i < size; i++)
-    {
-        int udata = (round)(data_fp32[i] / scale + zero_point);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-
-        date_u8[i] = udata;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    int n = 1, c = 1, h = 4, w = 4;
-    const char* test_node_name = "upsample";
-    int data_type = TENGINE_DT_UINT8;
-    int layout = TENGINE_LAYOUT_NCHW;
-
-    // init
-    int ret = test_graph_init();
-    if (0 != ret)
-        fprintf(stderr, "Tengine init failed.\n");
-
-    // create
-    struct graph* ir_graph = (struct graph* )create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node);
-    if(NULL == ir_graph)
-        return -1;
-
-    set_log_level(LOG_INFO);
-    dump_graph(ir_graph);
-
-    // set quantize params
-    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
-    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "upsample");
-
-//    tensor_t weight_tesnor = get_graph_input_tensor(ir_graph, 1, 0);
-    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
-    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-
-    // set input data
-    uint8_t input_u8[16] = {0};
-    get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point);
-    set_tensor_buffer(input_tensor, input_u8, 16);
-
-
-    // set bias data
-    // fill_input_uint8_tensor_by_index(graph, 0, 0, 0.0f);
-
-    // graph run
-    ret = test_graph_run(ir_graph);
-    if (0 != ret)
-    {
-        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
-        test_graph_release(ir_graph);
-        return -1;
-    }
-
-    // get output and dequant
-    uint8_t* output_u8 = ( uint8_t* )output_tensor->data;
-    int output_size = output_tensor->elem_num;
-
-    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
-    float* output_data = ( float* )malloc(output_size * sizeof(float));
-    for (int i = 0; i < output_size; i++)
-        output_data[i] = (( float )output_u8[i] - ( float )output_zero_point) * output_scale;
-
-    // check the result
-    ret = 0;
-    for (int i = 0; i< output_size; i++)
-    {
-        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
-        {
-            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
-            ret = -1;
-        }
-    }
-
-    if (ret == 0)
-        fprintf(stderr, "test pass.\n");
-    else
-        fprintf(stderr, "test failed.\n");
-
-    // exit
-    test_graph_release(ir_graph);
-
-    return ret;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+#include "test_op.h"
+
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "operator/prototype/upsample_param.h"
+
+/* Adds an "Upsample" node named node_name to graph, fed by the tensor called input_name, with one
+ * output tensor of data_type and scale 0.5. Returns 0 on success, -1 if the node or input is missing. */
+int create_test_interp_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w)
+{
+    (void)layout;
+    (void)n;
+    (void)c;
+    (void)h;
+    (void)w;
+
+    /* create the test node */
+    struct node* test_node = (struct node*)create_graph_node(graph, node_name, "Upsample");
+    /* the node is dereferenced below, so a failed creation is rejected together with a missing input */
+    tensor_t input_tensor = get_graph_tensor(graph, input_name);
+    if (NULL == test_node || NULL == input_tensor)
+    {
+        fprintf(stderr, "create test node failed.\n");
+        return -1;
+    }
+
+    /* input tensors of test node */
+    set_node_input_tensor(test_node, 0, input_tensor);
+
+    /* output tensors of test node */
+    tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type);
+    set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR);
+
+    /* set params */
+    struct upsample_param* param = (struct upsample_param*)test_node->op.param_mem;
+    param->scale = 0.5;
+
+    return 0;
+}
+
+/*
+ * scale = (max - min) / 255
+ * zero_point = -min / scale
+ * uint8   = clip(round(float32 / scale) + zero_point, 0, 255)
+ * float32 = (uint8 - zero_point) * scale
+ */
+float input_fp32[16] = {
+    1.0f, // element 0; presumably row 0 of a 4x4 plane (main uses h = 4, w = 4) - TODO confirm layout
+    1.0f,
+    1.0f,
+    1.0f,
+    1.0f, // element 4
+    2.0f,
+    2.0f,
+    1.0f,
+    1.0f, // element 8
+    2.0f,
+    2.0f,
+    1.0f,
+    1.0f, // element 12
+    1.0f,
+    1.0f,
+    1.0f,
+};
+float input_scale = 1;    // passed to get_uint8_data and set as the input tensor's quant scale in main
+int input_zero_point = 0; // passed alongside input_scale
+
+float reference_out[4] = {1, 1, 1, 2}; // expected dequantized output; main compares with tolerance 0.1
+float output_scale = 1;                // set on the output tensor, then re-read via get_tensor_quant_param
+int output_zero_point = 0;             // set and re-read together with output_scale
+
+/* Quantize `size` fp32 values into date_u8: round(v / scale + zero_point), saturated to [0, 255]. */
+void get_uint8_data(float* data_fp32, uint8_t* date_u8, int size, float scale, int zero_point)
+{
+    for (int idx = 0; idx < size; ++idx)
+    {
+        int quantized = round(data_fp32[idx] / scale + zero_point);
+
+        /* saturate to the uint8 range before the narrowing store */
+        quantized = (quantized > 255) ? 255 : quantized;
+        quantized = (quantized < 0) ? 0 : quantized;
+        date_u8[idx] = quantized;
+    }
+}
+
+/* Runs one uint8 "upsample" node on the graph built by create_timvx_test_graph and compares the
+ * dequantized output with reference_out (tolerance 0.1). Returns 0 on pass, -1 on any failure. */
+int main(int argc, char* argv[])
+{
+    int n = 1, c = 1, h = 4, w = 4;
+    const char* test_node_name = "upsample";
+    int data_type = TENGINE_DT_UINT8;
+    int layout = TENGINE_LAYOUT_NCHW;
+
+    // init: stop here on failure instead of building a graph on an uninitialized runtime
+    int ret = test_graph_init();
+    if (0 != ret)
+    {
+        fprintf(stderr, "Tengine init failed.\n");
+        return -1;
+    }
+
+    // create
+    struct graph* ir_graph = (struct graph*)create_timvx_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_interp_node);
+    if (NULL == ir_graph)
+        return -1;
+
+    set_log_level(LOG_INFO);
+    dump_graph(ir_graph);
+
+    // set quantize params
+    struct tensor* input_tensor = (struct tensor*)get_graph_tensor(ir_graph, "input_node");
+    struct tensor* output_tensor = (struct tensor*)get_graph_tensor(ir_graph, "upsample");
+
+    set_tensor_quant_param(input_tensor, &input_scale, &input_zero_point, 1);
+    set_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+
+    // set input data
+    uint8_t input_u8[16] = {0};
+    get_uint8_data(input_fp32, input_u8, 16, input_scale, input_zero_point);
+    set_tensor_buffer(input_tensor, input_u8, 16);
+
+    // graph run
+    ret = test_graph_run(ir_graph);
+    if (0 != ret)
+    {
+        fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret);
+        test_graph_release(ir_graph);
+        return -1;
+    }
+
+    // get output and dequant
+    uint8_t* output_u8 = (uint8_t*)output_tensor->data;
+    int output_size = output_tensor->elem_num;
+    get_tensor_quant_param(output_tensor, &output_scale, &output_zero_point, 1);
+    float* output_data = (float*)malloc(output_size * sizeof(float));
+    for (int i = 0; i < output_size; i++)
+        output_data[i] = ((float)output_u8[i] - (float)output_zero_point) * output_scale;
+
+    // check the result
+    ret = 0;
+    for (int i = 0; i < output_size; i++)
+    {
+        if (fabsf(output_data[i] - reference_out[i]) > 0.1)
+        {
+            fprintf(stderr, "index:%d, a:%f, b:%f\n", i, output_data[i], reference_out[i]);
+            ret = -1;
+        }
+    }
+
+    if (ret == 0)
+        fprintf(stderr, "test pass.\n");
+    else
+        fprintf(stderr, "test failed.\n");
+
+    // exit: free the dequantized copy before releasing the graph
+    free(output_data);
+    test_graph_release(ir_graph);
+    return ret;
+}
diff --git a/tools/convert_tool/caffe/caffe2tengine.cpp b/tools/convert_tool/caffe/caffe2tengine.cpp
index bf8556bb0..74f777abe 100644
--- a/tools/convert_tool/caffe/caffe2tengine.cpp
+++ b/tools/convert_tool/caffe/caffe2tengine.cpp
@@ -28,13 +28,12 @@
 *   SELF DEFINE VARIABLE
 *   FOR CAFFE SERIALIZER
 */
-const int OP_VERSION=1;
-
+const int OP_VERSION = 1;
 
 int caffe_serializer::load_text_file(std::string model_file, te_caffe::NetParameter& caffe_net)
 {
     std::ifstream is(model_file.c_str(), std::ios::in);
-    
+
     if (!is.is_open())
     {
         TLOG_ERR("cannot open file: %s \n", model_file.c_str());
@@ -70,7 +69,6 @@ int caffe_serializer::load_binary_file(std::string model_file, te_caffe::NetPara
     coded_input.SetTotalBytesLimit(INT_MAX, INT_MAX / 2);
 #endif
 
-
     bool ret = caffe_net.ParseFromCodedStream(&coded_input);
 
     is.close();
@@ -82,7 +80,7 @@ int caffe_serializer::load_binary_file(std::string model_file, te_caffe::NetPara
 }
 bool caffe_serializer::find_op_load_method(const std::string& op_name)
 {
-    if(op_load_map.count(op_name))
+    if (op_load_map.count(op_name))
         return true;
 
     return false;
@@ -96,7 +94,7 @@ ir_tensor_t* find_caffe_tensor(ir_graph_t* graph, const std::string& tensor_name
         if (tensor->name == tensor_name)
             return tensor;
     }
-    
+
     return nullptr;
 }
 
@@ -122,13 +120,13 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara
     {
         const te_caffe::LayerParameter& layer_param = test_net.layer(i);
         const std::string& caffe_op_name = layer_param.type();
-        if(!find_op_load_method(caffe_op_name))
+        if (!find_op_load_method(caffe_op_name))
         {
             // printf("%s \n", caffe_op_name.c_str());
-            auto it = find(no_supported_op.begin(),no_supported_op.end(),caffe_op_name);
-            if(it == no_supported_op.end())
+            auto it = find(no_supported_op.begin(), no_supported_op.end(), caffe_op_name);
+            if (it == no_supported_op.end())
             {
-                if(caffe_op_name == "Constant")
+                if (caffe_op_name == "Constant")
                     continue;
                 no_supported_op.push_back(caffe_op_name);
             }
@@ -137,7 +135,7 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara
     if (no_supported_op.size())
     {
         TLOG_ERR("These %d op are not supported\n{ ", no_supported_op.size());
-        for(int j = 0; j < (int) no_supported_op.size(); j++)
+        for (int j = 0; j < (int)no_supported_op.size(); j++)
         {
             TLOG_ERR("%s ", no_supported_op[j].c_str());
         }
@@ -165,12 +163,12 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara
             // ir_tensor_t* tensor = find_caffe_tensor(graph, orig_name);
 
             int tensor_id = get_ir_tensor_index_from_name(graph, orig_name.c_str());
-            ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id);        
+            ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id);
             // fprintf(stderr, "input tensor : %s \n", tensor->name);
 
             set_ir_node_input_tensor(ir_node, i, tensor);
 
-            if(train_name_map.count(layer_param.name()))
+            if (train_name_map.count(layer_param.name()))
             {
                 // printf("train data copy in: %s \n", layer_param.name().c_str());
 
@@ -182,7 +180,8 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara
                 if (p_train->blobs_size())
                 {
                     blob_load_t func = blob_load_map[caffe_op_name];
-                    if (!func(graph, ir_node, *p_train)){
+                    if (!func(graph, ir_node, *p_train))
+                    {
                         break;
                     }
                 }
@@ -191,7 +190,6 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara
             // output_tensors.push_back(tensor);
             // fprintf(stderr, "output_tensors num: %d %s\n", (int)output_tensors.size(), output_tensors[(int)output_tensors.size()-1]->name);
         }
-       
 
         // fprintf(stderr, "layer_param.top_size() %d %s \n", layer_param.top_size(), caffe_op_name.c_str());
         for (int i = 0; i < layer_param.top_size(); i++)
@@ -224,7 +222,7 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara
             TLOG_ERR("load op %s func failed in node %s .\n", caffe_op_name.c_str(), ir_node->name);
             return -1;
         }
-        #if 0
+#if 0
         if(train_name_map.count(layer_param.name()))
         {
             // fprintf(stderr, "train_name_map : %s \n", layer_param.name().c_str());
@@ -240,15 +238,14 @@ int caffe_serializer::load_graph_node(ir_graph_t* graph, const te_caffe::NetPara
                 }
             }
         }
-        #endif
-
+#endif
     }
     // printf("tensor \n");
-    if (n < layer_number){
+    if (n < layer_number)
+    {
         fprintf(stderr, "Check layer number error ! \n");
         return -1;
     }
-
 }
 int caffe_serializer::load_tensor_data(ir_graph_t* graph, const te_caffe::NetParameter test_net, const te_caffe::NetParameter train_net)
 {
@@ -265,7 +262,7 @@ int caffe_serializer::load_tensor_data(ir_graph_t* graph, const te_caffe::NetPar
     layer_number = test_net.layer_size();
 
     int size = (int)op_load_map.size();
-    
+
     int n;
     // printf("layer number : %d \n", layer_number);
     for (n = 0; n < layer_number; n++)
@@ -277,16 +274,16 @@ int caffe_serializer::load_tensor_data(ir_graph_t* graph, const te_caffe::NetPar
         if (ir_node == NULL)
             return -1;
 
-        if(train_name_map.count(layer_param.name()))
+        if (train_name_map.count(layer_param.name()))
         {
-       
             const te_caffe::LayerParameter* p_train;
 
             p_train = train_name_map[layer_param.name()];
             if (p_train->blobs_size())
             {
                 blob_load_t func = blob_load_map[caffe_op_name];
-                if (!func(graph, ir_node, *p_train)){
+                if (!func(graph, ir_node, *p_train))
+                {
                     break;
                 }
             }
@@ -307,7 +304,7 @@ int caffe_serializer::load_model(ir_graph_t* graph, std::string model_file, std:
         return -1;
     fprintf(stderr, "Process 2: Finish load protobuf file \n");
     // if (load_tensor_data(graph, test_net, train_net) < 0)
-        // return -1;
+    // return -1;
     fprintf(stderr, "Process 3: Finish load graph node \n");
     if (load_graph_node(graph, test_net, train_net) < 0)
         return -1;
@@ -321,7 +318,7 @@ int caffe_serializer::load_model(ir_graph_t* graph, std::string model_file, std:
     //     return -1;
     // if (set_graph_output(graph, onnx_graph) < 0)
     //     return -1;
-    
+
     return 0;
 }
 
@@ -350,7 +347,6 @@ graph_t caffe_serializer::caffe2tengine(std::string model_file, std::string prot
     return ir_graph;
 }
 
-
 static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::vector<std::string>& name_list,
                           const std::vector<std::string>& layout_list, const te_caffe::LayerParameter& layer_param)
 
@@ -361,7 +357,7 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v
     {
         std::string node_name = ir_node->name;
         std::string new_tensor_name = node_name + "/" + name_list[i];
-        
+
         ir_tensor_t* ir_tensor = create_ir_tensor(ir_graph, new_tensor_name.c_str(), TENGINE_DT_FP32);
 
         /* load tensor data*/
@@ -369,12 +365,12 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v
         const te_caffe::BlobProto& blob = layer_param.blobs(i);
 
         int dim_num = 0;
-        int *dims;
+        int* dims;
         if (blob.has_shape())
         {
             dim_num = blob.shape().dim_size();
-            dims = (int*)malloc(sizeof(int)*dim_num);
-            memset(dims, 0, sizeof(int)*dim_num);
+            dims = (int*)malloc(sizeof(int) * dim_num);
+            memset(dims, 0, sizeof(int) * dim_num);
             for (int i = 0; i < dim_num; i++)
             {
                 dims[i] = blob.shape().dim(i);
@@ -393,17 +389,17 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v
             while (temp[start] == 1)
                 start++;
 
-            dim_num = temp.size() - start; 
-            dims = (int*)malloc(sizeof(int)*dim_num);
-            memset(dims, 0, sizeof(int)*dim_num);
+            dim_num = temp.size() - start;
+            dims = (int*)malloc(sizeof(int) * dim_num);
+            memset(dims, 0, sizeof(int) * dim_num);
             for (unsigned int i = start; i < temp.size(); i++)
                 dims[i] = temp[i];
         }
-        if ( dim_num > 0)
+        if (dim_num > 0)
         {
             set_ir_tensor_shape(ir_tensor, dims, dim_num);
             ir_tensor->tensor_type = TENSOR_TYPE_CONST;
-            int tensor_size = ir_tensor->elem_num *  sizeof(float);
+            int tensor_size = ir_tensor->elem_num * sizeof(float);
             ir_tensor->data = sys_malloc(tensor_size);
             float* ptr = (float*)ir_tensor->data;
 
@@ -418,12 +414,10 @@ static void LoadCaffeBlob(ir_graph_t* ir_graph, ir_node_t* ir_node, const std::v
         // int  index = get_ir_node_index_from_name(ir_graph, new_tensor_name.c_str());
 
         set_ir_node_output_tensor(new_ir_node, 0, ir_tensor);
-        set_ir_node_input_tensor(ir_node, i+1, ir_tensor);
+        set_ir_node_input_tensor(ir_node, i + 1, ir_tensor);
     }
 }
 
-
-
 static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char* name, const char* layout,
                              std::vector<int>& temp, float val, int index)
 {
@@ -434,8 +428,8 @@ static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char*
     int dim_num = temp.size();
     if (dim_num > 0)
     {
-        int *dims = (int*)malloc(sizeof(int)*dim_num);
-        memset(dims, 0, sizeof(int)*dim_num);
+        int* dims = (int*)malloc(sizeof(int) * dim_num);
+        memset(dims, 0, sizeof(int) * dim_num);
         int elem_size = 1;
 
         for (unsigned int i = 0; i < dim_num; i++)
@@ -445,7 +439,7 @@ static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char*
         }
         set_ir_tensor_shape(ir_tensor, dims, dim_num);
         ir_tensor->tensor_type = TENSOR_TYPE_CONST;
-        int tensor_size = elem_size *  sizeof(float);
+        int tensor_size = elem_size * sizeof(float);
         ir_tensor->data = sys_malloc(tensor_size);
 
         float* ptr = (float*)ir_tensor->data;
@@ -459,13 +453,11 @@ static void CreatePresetNode(ir_graph_t* graph, ir_node_t* ir_node, const char*
     set_ir_node_input_tensor(new_ir_node, 0, ir_tensor);
 }
 
-
 bool load_batchnorm_blob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
-   const te_caffe::BlobProto& rescale_blob = layer_param.blobs(2);
-
+    const te_caffe::BlobProto& rescale_blob = layer_param.blobs(2);
 
-    struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )node->op.param_mem;
+    struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)node->op.param_mem;
 
     batchnorm_param->rescale_factor = rescale_blob.data(0);
 
@@ -486,13 +478,13 @@ bool load_batchnorm_blob(ir_graph_t* graph, ir_node_t* node, const te_caffe::Lay
         std::vector<std::string> layout_list = {"W", "W"};
 
         LoadCaffeBlob(graph, node, name_list, layout_list, layer_param);
-    } 
+    }
     return 0;
 }
 
 int load_batchnorm(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
-    struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )node->op.param_mem;
+    struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)node->op.param_mem;
 
     const te_caffe::BatchNormParameter& bn_param = layer_param.batch_norm_param();
 
@@ -511,7 +503,7 @@ int load_softmax(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParame
 {
     const te_caffe::SoftmaxParameter& softmax_param = layer_param.softmax_param();
 
-    struct softmax_param* param = ( struct softmax_param* )node->op.param_mem;
+    struct softmax_param* param = (struct softmax_param*)node->op.param_mem;
 
     if (softmax_param.has_axis())
         param->axis = softmax_param.axis();
@@ -521,13 +513,12 @@ int load_softmax(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParame
     return 0;
 }
 
-
 int load_conv(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
     const te_caffe::ConvolutionParameter& conv_param = layer_param.convolution_param();
     // const te_caffe::LayerParameter& layer_param = caffe_net.layer(i);
     const std::string& caffe_op_name = layer_param.type();
-    struct conv_param* param = ( struct conv_param* )node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)node->op.param_mem;
 
     if (conv_param.has_kernel_h() && conv_param.has_kernel_w())
     {
@@ -596,7 +587,7 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParamet
 {
     const te_caffe::ConvolutionParameter& conv_param = layer_param.convolution_param();
 
-    struct deconv_param* param = ( struct deconv_param* )node->op.param_mem;
+    struct deconv_param* param = (struct deconv_param*)node->op.param_mem;
 
     if (conv_param.has_kernel_h() && conv_param.has_kernel_w())
     {
@@ -667,12 +658,11 @@ PoolArg ConvertCaffePool(te_caffe::PoolingParameter_PoolMethod method)
     return kPoolMax;
 }
 
-
 int load_fc(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
     const te_caffe::InnerProductParameter& ip_param = layer_param.inner_product_param();
 
-    struct fc_param* param = ( struct fc_param* )node->op.param_mem;
+    struct fc_param* param = (struct fc_param*)node->op.param_mem;
     param->num_output = ip_param.num_output();
 
     /* Load weight and bias blob */
@@ -700,7 +690,7 @@ int load_normalize(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerPara
 {
     const te_caffe::NormalizeParameter& normalize_param = layer_param.norm_param();
 
-    struct normalize_param* param = ( struct normalize_param* )node->op.param_mem;
+    struct normalize_param* param = (struct normalize_param*)node->op.param_mem;
 
     param->across_spatial = normalize_param.across_spatial();
     param->channel_shared = normalize_param.channel_shared();
@@ -708,10 +698,9 @@ int load_normalize(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerPara
     return 0;
 }
 
-
 int load_scale(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
-    struct scale_param* param = ( struct scale_param* )node->op.param_mem;
+    struct scale_param* param = (struct scale_param*)node->op.param_mem;
 
     const te_caffe::ScaleParameter& scale_param = layer_param.scale_param();
 
@@ -724,7 +713,6 @@ int load_scale(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParamete
     if (scale_param.has_bias_term())
         param->bias_term = scale_param.bias_term();
 
-
     if (layer_param.blobs_size())
     {
         std::vector<std::string> name_list = {"gamma", "beta"};
@@ -738,7 +726,7 @@ int load_scale(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParamete
 
 int load_relu(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
-    struct relu_param* param = ( struct relu_param* )node->op.param_mem;
+    struct relu_param* param = (struct relu_param*)node->op.param_mem;
 
     const te_caffe::ReLUParameter& caffe_param = layer_param.relu_param();
 
@@ -752,13 +740,12 @@ int load_relu(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter
 
 int load_split(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
-    struct split_param* param = ( struct split_param* )node->op.param_mem;
+    struct split_param* param = (struct split_param*)node->op.param_mem;
     param->is_caffe = true;
 
     return 0;
 }
 
-
 #if 0
 int load_data(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
@@ -786,7 +773,7 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter
 {
     const te_caffe::PoolingParameter& pool_param = layer_param.pooling_param();
 
-    struct pool_param* param = ( struct pool_param* )node->op.param_mem;
+    struct pool_param* param = (struct pool_param*)node->op.param_mem;
 
     // param.alg = ConvertCaffePool(pool_param.pool());
     if (pool_param.has_kernel_size())
@@ -830,7 +817,6 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter
 
     param->caffe_flavor = 1;
 
-
     return 0;
 }
 static EltType ConvertCaffeEltwise(te_caffe::EltwiseParameter_EltwiseOp method)
@@ -847,7 +833,7 @@ static EltType ConvertCaffeEltwise(te_caffe::EltwiseParameter_EltwiseOp method)
 int load_eltwise(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
     const te_caffe::EltwiseParameter& eltwise_param = layer_param.eltwise_param();
-    struct eltwise_param* param = ( struct eltwise_param* )node->op.param_mem;
+    struct eltwise_param* param = (struct eltwise_param*)node->op.param_mem;
     // defalt: SUM
     param->type = ELT_SUM;
     if (eltwise_param.has_operation())
@@ -889,16 +875,16 @@ int load_input(ir_graph_t* graph, ir_node_t* ir_node, const te_caffe::LayerParam
     if (dim_num == 0)
         has_shape = 0;
 
-    #if 1
+#if 1
     if (has_shape)
     {
-        int* dims = (int*)malloc(sizeof(int)*dim_num);
-        memset(dims, 0, sizeof(int)*dim_num);
-        for(int i = 0; i < dim_num ; i++)
+        int* dims = (int*)malloc(sizeof(int) * dim_num);
+        memset(dims, 0, sizeof(int) * dim_num);
+        for (int i = 0; i < dim_num; i++)
             dims[i] = dim[i];
         set_ir_tensor_shape(tensor, dims, dim_num);
     }
-    #endif
+#endif
 
     ir_node_t* node = create_ir_node(graph, val.c_str(), OP_INPUT, OP_VERSION);
     set_ir_node_output_tensor(node, 0, tensor);
@@ -935,7 +921,6 @@ int LoadDeconvolutionBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::La
     return true;
 }
 
-
 int LoadBiasBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
     if (layer_param.blobs_size())
@@ -948,7 +933,6 @@ int LoadBiasBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParame
     return true;
 }
 
-
 int LoadFullyConnectedBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param)
 {
     if (layer_param.blobs_size())
@@ -995,11 +979,10 @@ int LoadBatchNormBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerP
 {
     const te_caffe::BlobProto& rescale_blob = layer_param.blobs(2);
 
-    struct batchnorm_param* param = ( struct batchnorm_param* )node->op.param_mem;
+    struct batchnorm_param* param = (struct batchnorm_param*)node->op.param_mem;
 
     param->rescale_factor = rescale_blob.data(0);
 
-
     /* for compatible reason, create the two tensors: gamma (1.0) and beta (0.0) */
 
     /* get the dim, i.e., channel size */
@@ -1025,34 +1008,33 @@ int LoadBatchNormBlob(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerP
 */
 void caffe_serializer::register_op_load()
 {
-    op_load_map["BatchNorm"]                    = std::pair<int, op_load_t>(OP_UNARY,           load_batchnorm);
-    op_load_map["Convolution"]                  = std::pair<int, op_load_t>(OP_CONV,            load_conv);
-    op_load_map["DeConvolution"]                = std::pair<int, op_load_t>(OP_DECONV,          load_deconv);
-    op_load_map["Softmax"]                      = std::pair<int, op_load_t>(OP_SOFTMAX,         load_softmax);
-    op_load_map["PReLU"]                        = std::pair<int, op_load_t>(OP_PRELU,           load_prelu);
-    op_load_map["InnerProduct"]                 = std::pair<int, op_load_t>(OP_FC,              load_fc);
-    op_load_map["SoftmaxWithLoss"]              = std::pair<int, op_load_t>(OP_SOFTMAX,         load_softmax);
-    op_load_map["Normalize"]                    = std::pair<int, op_load_t>(OP_NORMALIZE,       load_normalize);
-    op_load_map["Scale"]                        = std::pair<int, op_load_t>(OP_SCALE,           load_scale);
-    op_load_map["ReLU"]                         = std::pair<int, op_load_t>(OP_RELU,            load_relu);
-    op_load_map["Split"]                        = std::pair<int, op_load_t>(OP_SPLIT,           load_split);
-    op_load_map["Pooling"]                      = std::pair<int, op_load_t>(OP_POOL,            load_pool);
-    op_load_map["Eltwise"]                      = std::pair<int, op_load_t>(OP_ELTWISE,         load_eltwise);
-    op_load_map["Input"]                        = std::pair<int, op_load_t>(OP_INPUT,           load_input);
-    op_load_map["Data"]                         = std::pair<int, op_load_t>(OP_INPUT,           load_input);
-
-
-    blob_load_map["Convolution"]                = LoadConvolutionBlob;
+    op_load_map["BatchNorm"] = std::pair<int, op_load_t>(OP_UNARY, load_batchnorm);
+    op_load_map["Convolution"] = std::pair<int, op_load_t>(OP_CONV, load_conv);
+    op_load_map["DeConvolution"] = std::pair<int, op_load_t>(OP_DECONV, load_deconv);
+    op_load_map["Softmax"] = std::pair<int, op_load_t>(OP_SOFTMAX, load_softmax);
+    op_load_map["PReLU"] = std::pair<int, op_load_t>(OP_PRELU, load_prelu);
+    op_load_map["InnerProduct"] = std::pair<int, op_load_t>(OP_FC, load_fc);
+    op_load_map["SoftmaxWithLoss"] = std::pair<int, op_load_t>(OP_SOFTMAX, load_softmax);
+    op_load_map["Normalize"] = std::pair<int, op_load_t>(OP_NORMALIZE, load_normalize);
+    op_load_map["Scale"] = std::pair<int, op_load_t>(OP_SCALE, load_scale);
+    op_load_map["ReLU"] = std::pair<int, op_load_t>(OP_RELU, load_relu);
+    op_load_map["Split"] = std::pair<int, op_load_t>(OP_SPLIT, load_split);
+    op_load_map["Pooling"] = std::pair<int, op_load_t>(OP_POOL, load_pool);
+    op_load_map["Eltwise"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Input"] = std::pair<int, op_load_t>(OP_INPUT, load_input);
+    op_load_map["Data"] = std::pair<int, op_load_t>(OP_INPUT, load_input);
+
+    blob_load_map["Convolution"] = LoadConvolutionBlob;
     // blob_load_map["Deconvolution"]              = LoadDeconvolutionBlob;
-    blob_load_map["InnerProduct"]               = LoadFullyConnectedBlob;
-    blob_load_map["BatchNorm"]                  = LoadBatchNormBlob;
-    blob_load_map["Scale"]                      = LoadScaleBlob;
+    blob_load_map["InnerProduct"] = LoadFullyConnectedBlob;
+    blob_load_map["BatchNorm"] = LoadBatchNormBlob;
+    blob_load_map["Scale"] = LoadScaleBlob;
     // blob_load_map["PReLU"]                      = LoadPReLuBlob;
     // blob_load_map["Normalize"]                  = LoadNormalizeBlob;
     // blob_load_map["ConvolutionDepthwise"]       = LoadConvolutionBlob;
     // blob_load_map["DepthwiseConvolution"]       = LoadConvolutionBlob;
-    blob_load_map["Bias"]                       = LoadBiasBlob;
-    #if 0
+    blob_load_map["Bias"] = LoadBiasBlob;
+#if 0
     op_load_map["Data"]                         = std::pair<int, op_load_t>(OP_INPUT,           load_data);
     op_load_map["Slice"]                        = std::pair<int, op_load_t>(OP_SLICE,           load_slice);
     op_load_map["Concat"]                       = std::pair<int, op_load_t>(OP_CONCAT,          load_concat);
@@ -1087,8 +1069,7 @@ void caffe_serializer::register_op_load()
     op_load_map["MVN"]                          = std::pair<int, op_load_t>(OP_MVN,             load_mvn);
     op_load_map["Reduction"]                    = std::pair<int, op_load_t>(OP_REDUCTION,       load_reduction);
     op_load_map["Bias"]                         = std::pair<int, op_load_t>(OP_BIAS,            load_bias);
-    #endif
-
+#endif
 }
 /*
 *   OPERAOTR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER END
diff --git a/tools/convert_tool/caffe/caffe2tengine.hpp b/tools/convert_tool/caffe/caffe2tengine.hpp
index 844623bfc..1a5ca52d9 100644
--- a/tools/convert_tool/caffe/caffe2tengine.hpp
+++ b/tools/convert_tool/caffe/caffe2tengine.hpp
@@ -36,19 +36,18 @@
 #include <google/protobuf/text_format.h>
 #include <google/protobuf/message.h>
 
-extern "C" 
-{
-    #include "tengine/c_api.h"
-    #include "graph/graph.h"
-    #include "graph/subgraph.h"
-    #include "graph/node.h"
-    #include "graph/tensor.h"
-    #include "executer/executer.h"
-    #include "module/module.h"
-    #include "utility/log.h"
-    #include "utility/sys_port.h"
-    #include "utility/vector.h"
-    #include "../utils/save_graph/op_include.h"
+extern "C" {
+#include "tengine/c_api.h"
+#include "graph/graph.h"
+#include "graph/subgraph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "executer/executer.h"
+#include "module/module.h"
+#include "utility/log.h"
+#include "utility/sys_port.h"
+#include "utility/vector.h"
+#include "../utils/save_graph/op_include.h"
 }
 
 enum PoolArg
@@ -63,24 +62,22 @@ class caffe_serializer
 public:
     graph_t caffe2tengine(std::string model_file, std::string proto_file);
     typedef int (*op_load_t)(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param);
-    typedef int (*blob_load_t)(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param);    
+    typedef int (*blob_load_t)(ir_graph_t* graph, ir_node_t* node, const te_caffe::LayerParameter& layer_param);
     using name_map_t = std::unordered_map<std::string, std::string>;
 
 private:
-    std::unordered_map<std::string, std::pair<int, op_load_t>> op_load_map;
+    std::unordered_map<std::string, std::pair<int, op_load_t> > op_load_map;
     std::unordered_map<std::string, blob_load_t> blob_load_map;
     int load_model(ir_graph_t* graph, std::string model_file, std::string proto_file);
     int load_graph_node(ir_graph_t* graph, const te_caffe::NetParameter test_net, const te_caffe::NetParameter train_net);
     int load_tensor_data(ir_graph_t* graph, const te_caffe::NetParameter test_net, const te_caffe::NetParameter train_net);
     int load_text_file(std::string model_file, te_caffe::NetParameter& caffe_net);
     int load_binary_file(std::string model_file, te_caffe::NetParameter& caffe_net);
-    bool find_op_load_method(const std::string& op_name);  
+    bool find_op_load_method(const std::string& op_name);
     void register_op_load();
 
     std::unordered_map<std::string, const te_caffe::LayerParameter*> train_name_map;
     std::vector<ir_tensor_t*> output_tensors;
-    
 };
 
-
 #endif
\ No newline at end of file
diff --git a/tools/convert_tool/convert_tool.cpp b/tools/convert_tool/convert_tool.cpp
index 8220ff302..a3d55374a 100644
--- a/tools/convert_tool/convert_tool.cpp
+++ b/tools/convert_tool/convert_tool.cpp
@@ -34,11 +34,11 @@
 #include "utils/graph_optimizer/graph_opt.hpp"
 
 const char* help_params = "[Convert Tools Info]: optional arguments:\n"
-                      "\t-h    help            show this help message and exit\n"
-                      "\t-f    input type      path to input float32 tmfile\n"
-                      "\t-p    input structure path to the network structure of input model(*.prototxt, *.symbol, *.cfg, *.pdmodel)\n"
-                      "\t-m    input params    path to the network params of input model(*.caffemodel, *.params, *.weight, *.pb, *.onnx, *.tflite, *.pdiparams)\n"
-                      "\t-o    output model    path to output fp32 tmfile\n";
+                          "\t-h    help            show this help message and exit\n"
+                          "\t-f    input type      path to input float32 tmfile\n"
+                          "\t-p    input structure path to the network structure of input model(*.prototxt, *.symbol, *.cfg, *.pdmodel)\n"
+                          "\t-m    input params    path to the network params of input model(*.caffemodel, *.params, *.weight, *.pb, *.onnx, *.tflite, *.pdiparams)\n"
+                          "\t-o    output model    path to output fp32 tmfile\n";
 
 const char* example_params = "[Convert Tools Info]: example arguments:\n"
                              "\t./convert_tool -f caffe -p ./mobilenet.prototxt -m ./mobilenet.caffemodel -o ./mobilenet.tmfile\n";
@@ -64,24 +64,24 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'f':
-                file_format = optarg;
-                break;
-            case 'p':
-                proto_file = optarg;
-                break;
-            case 'm':
-                model_file = optarg;
-                break;
-            case 'o':
-                output_tmfile = optarg;
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                show_usage();
-                break;
+        case 'f':
+            file_format = optarg;
+            break;
+        case 'p':
+            proto_file = optarg;
+            break;
+        case 'm':
+            model_file = optarg;
+            break;
+        case 'o':
+            output_tmfile = optarg;
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            show_usage();
+            break;
         }
     }
 
@@ -105,8 +105,7 @@ int main(int argc, char* argv[])
             model_file_needed = true;
             input_file_number = 2;
         }
-        else if (file_format == "caffe_single" || file_format == "onnx" || file_format == "tensorflow" ||
-                 file_format == "tflite")
+        else if (file_format == "caffe_single" || file_format == "onnx" || file_format == "tensorflow" || file_format == "tflite")
         {
             model_file_needed = true;
             input_file_number = 1;
@@ -160,7 +159,7 @@ int main(int argc, char* argv[])
             return -1;
         }
     }
-    
+
     init_tengine();
     set_log_level(LOG_INFO);
     graph_t graph = NULL;
@@ -176,8 +175,8 @@ int main(int argc, char* argv[])
     }
     else if (file_format == "ncnn")
     {
-	ncnn_serializer n2t;
-	graph = n2t.ncnn2tengine(model_file, proto_file);
+        ncnn_serializer n2t;
+        graph = n2t.ncnn2tengine(model_file, proto_file);
     }
     else
     {
@@ -196,7 +195,7 @@ int main(int argc, char* argv[])
         fprintf(stderr, "optimize graph failed! \n");
         return -1;
     }
-    
+
     if (save_graph(graph, output_tmfile.c_str()) < 0)
     {
         fprintf(stderr, "save graph failed! \n");
diff --git a/tools/convert_tool/ncnn/ncnn2tengine.cpp b/tools/convert_tool/ncnn/ncnn2tengine.cpp
index f78b67d1d..f475a7230 100644
--- a/tools/convert_tool/ncnn/ncnn2tengine.cpp
+++ b/tools/convert_tool/ncnn/ncnn2tengine.cpp
@@ -28,7 +28,7 @@
 *   SELF DEFINE VARIABLE
 *   FOR ONNX SERIALIZER
 */
-const int OP_VERSION=1;
+const int OP_VERSION = 1;
 
 /*
 *   ASSIST FUNCTIONS FOR NCNN SERIALIZER START
@@ -130,21 +130,30 @@ int ncnn_serializer::read(void* buf, int size)
 {
     return fread(buf, 1, size, fp);
 }
-void remove_ncnn_split(std::vector<NcnnNode>& nodelist){
-    for(auto &curr_node : nodelist){
-        if(curr_node.op == "Split"){
-            for(auto &in_node : nodelist){
-                if(in_node.output_name[0] == curr_node.inputs_name[0]){
+void remove_ncnn_split(std::vector<NcnnNode>& nodelist)
+{
+    for (auto& curr_node : nodelist)
+    {
+        if (curr_node.op == "Split")
+        {
+            for (auto& in_node : nodelist)
+            {
+                if (in_node.output_name[0] == curr_node.inputs_name[0])
+                {
                     auto out_name = in_node.output_name[0];
-                    for(auto &out_node : nodelist){
-                        for(auto &out_node_inbound_name : out_node.inputs_name){
-                            for(auto &curr_node_outbound_name : curr_node.output_name){
-                                if(out_node_inbound_name == curr_node_outbound_name){
+                    for (auto& out_node : nodelist)
+                    {
+                        for (auto& out_node_inbound_name : out_node.inputs_name)
+                        {
+                            for (auto& curr_node_outbound_name : curr_node.output_name)
+                            {
+                                if (out_node_inbound_name == curr_node_outbound_name)
+                                {
                                     out_node.inputs_name.erase(std::remove(
-                                        out_node.inputs_name.begin(),
-                                        out_node.inputs_name.end(),
-                                        out_node_inbound_name
-                                    ), out_node.inputs_name.end());
+                                                                   out_node.inputs_name.begin(),
+                                                                   out_node.inputs_name.end(),
+                                                                   out_node_inbound_name),
+                                                               out_node.inputs_name.end());
                                     out_node.inputs_name.push_back(in_node.output_name[0]);
                                 }
                             }
@@ -154,7 +163,7 @@ void remove_ncnn_split(std::vector<NcnnNode>& nodelist){
             }
         }
     }
-    nodelist.erase(std::remove_if(nodelist.begin(), nodelist.end(), [&](NcnnNode& n){return n.op == "Split";}), nodelist.end());
+    nodelist.erase(std::remove_if(nodelist.begin(), nodelist.end(), [&](NcnnNode& n) { return n.op == "Split"; }), nodelist.end());
 }
 int ncnn_serializer::load_model_file(const char* fname, std::vector<NcnnNode>& nodelist)
 {
@@ -169,10 +178,10 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector<NcnnNode>& n
     int res = 0;
     int magic = 0;
     res = fscanf(fp, "%d=", &magic);
-    fprintf(stderr, "%s magic: %d \n",fname, magic);
+    fprintf(stderr, "%s magic: %d \n", fname, magic);
     if (magic != 7767517)
     {
-         TLOG_ERR("param is too old, please regenerate \n");
+        TLOG_ERR("param is too old, please regenerate \n");
     }
     int layer_count = 0;
     int blob_count = 0;
@@ -196,7 +205,6 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector<NcnnNode>& n
         node.optimized = 0;
         node.name = layer_name;
 
-
         for (int j = 0; j < bottom_count; j++)
         {
             char bottom_name[256];
@@ -213,7 +221,7 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector<NcnnNode>& n
 
         if (res < 0)
         {
-             TLOG_ERR( "Read Param file data failed\n");
+            TLOG_ERR("Read Param file data failed\n");
             return false;
         }
         while (fscanf(fp, "%d=", &id) == 1)
@@ -238,8 +246,8 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector<NcnnNode>& n
                     return false;
                 }
 
-                params[id].f_data_array = ( float* )malloc(sizeof(float) * len);
-                params[id].i_data_array = ( int* )malloc(sizeof(int) * len);
+                params[id].f_data_array = (float*)malloc(sizeof(float) * len);
+                params[id].i_data_array = (int*)malloc(sizeof(int) * len);
                 // std::vector<std::string> opt_str;
                 std::string str = "";
                 for (int j = 0; j < len; j++)
@@ -292,8 +300,8 @@ int ncnn_serializer::load_model_file(const char* fname, std::vector<NcnnNode>& n
                         return false;
                     }
                     std::string str = "";
-                    params[id].f_data_array = ( float* )malloc(sizeof(float) * len);
-                    params[id].i_data_array = ( int* )malloc(sizeof(int) * len);
+                    params[id].f_data_array = (float*)malloc(sizeof(float) * len);
+                    params[id].i_data_array = (int*)malloc(sizeof(int) * len);
                     for (int j = 0; j < len; j++)
                     {
                         char vstr[16];
@@ -376,16 +384,15 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
     fp = fopen(fname, "rb");
     if (!fp)
     {
-         TLOG_ERR("Cannot open the bin file: %d\n ");
+        TLOG_ERR("Cannot open the bin file: %d\n ");
         return false;
     }
 
     float magic = 0;
     int nscan = 0;
-    for (int i = 0; i < ( int )nodelist.size(); i++)
+    for (int i = 0; i < (int)nodelist.size(); i++)
     {
-        if (nodelist[i].op == "Convolution" || nodelist[i].op == "DeconvolutionDepthWise" ||
-            nodelist[i].op == "Deconvolution" || nodelist[i].op == "ConvolutionDepthWise")
+        if (nodelist[i].op == "Convolution" || nodelist[i].op == "DeconvolutionDepthWise" || nodelist[i].op == "Deconvolution" || nodelist[i].op == "ConvolutionDepthWise")
         {
             NcnnParam weight;
             nscan = read(&magic, sizeof(float));
@@ -398,7 +405,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             iter = nodelist[i].attrs.find(0);
             int output_channel = std::atoi(iter->second.c_str());
 
-            weight.data = ( float* )malloc(sizeof(float) * weight.data_len);
+            weight.data = (float*)malloc(sizeof(float) * weight.data_len);
             read(weight.data, sizeof(float) * weight.data_len);
             // printf("%f %f \n", weight.data, weight.data);
             iter = nodelist[i].attrs.find(1);
@@ -410,8 +417,8 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             weight.dims.push_back(kernel_size);
             iter = nodelist[i].attrs.find(5);
             int biasTerm = 0;
-            
-            if(!iter->second.empty())
+
+            if (!iter->second.empty())
                 biasTerm = std::atoi(iter->second.c_str());
 
             paramlist.push_back(weight);
@@ -420,7 +427,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
                 NcnnParam bias;
                 bias.name = nodelist[i].name + "_b";
                 bias.data_len = output_channel;
-                bias.data = ( float* )malloc(sizeof(float) * output_channel);
+                bias.data = (float*)malloc(sizeof(float) * output_channel);
                 read(bias.data, sizeof(float) * output_channel);
                 bias.dims.push_back(output_channel);
                 paramlist.push_back(bias);
@@ -442,10 +449,10 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             variance.data_len = std::atoi(iter->second.c_str());
             bias.data_len = std::atoi(iter->second.c_str());
 
-            bias.data = ( float* )malloc(sizeof(float) * slope.data_len);
-            variance.data = ( float* )malloc(sizeof(float) * slope.data_len);
-            slope.data = ( float* )malloc(sizeof(float) * slope.data_len);
-            mean.data = ( float* )malloc(sizeof(float) * slope.data_len);
+            bias.data = (float*)malloc(sizeof(float) * slope.data_len);
+            variance.data = (float*)malloc(sizeof(float) * slope.data_len);
+            slope.data = (float*)malloc(sizeof(float) * slope.data_len);
+            mean.data = (float*)malloc(sizeof(float) * slope.data_len);
 
             read(slope.data, sizeof(float) * slope.data_len);
             read(mean.data, sizeof(float) * slope.data_len);
@@ -474,8 +481,8 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             iter = nodelist[i].attrs.find(0);
             bias.data_len = std::atoi(iter->second.c_str());
 
-            weight.data = ( float* )malloc(sizeof(float) * weight.data_len);
-            bias.data = ( float* )malloc(sizeof(float) * bias.data_len);
+            weight.data = (float*)malloc(sizeof(float) * weight.data_len);
+            bias.data = (float*)malloc(sizeof(float) * bias.data_len);
             read(weight.data, sizeof(float) * weight.data_len);
             read(bias.data, sizeof(float) * bias.data_len);
             weight.dims.push_back(weight.data_len);
@@ -494,7 +501,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             iter = nodelist[i].attrs.find(2);
             weight.data_len = std::atoi(iter->second.c_str());
 
-            weight.data = ( float* )malloc(sizeof(float) * weight.data_len);
+            weight.data = (float*)malloc(sizeof(float) * weight.data_len);
             read(weight.data, sizeof(float) * weight.data_len);
             weight.dims.push_back(output_num);
             weight.dims.push_back(weight.data_len / output_num);
@@ -506,7 +513,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
                 NcnnParam bias;
                 bias.name = nodelist[i].name + "_b";
                 bias.data_len = output_num;
-                bias.data = ( float* )malloc(sizeof(float) * output_num);
+                bias.data = (float*)malloc(sizeof(float) * output_num);
                 read(bias.data, sizeof(float) * output_num);
                 bias.dims.push_back(output_num);
                 paramlist.push_back(bias);
@@ -520,7 +527,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             std::map<int, std::string>::iterator iter;
             iter = nodelist[i].attrs.find(3);
             scale.data_len = std::atoi(iter->second.c_str());
-            scale.data = ( float* )malloc(sizeof(float) * scale.data_len);
+            scale.data = (float*)malloc(sizeof(float) * scale.data_len);
             read(scale.data, sizeof(float) * scale.data_len);
             scale.dims.push_back(scale.data_len);
             paramlist.push_back(scale);
@@ -533,7 +540,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             std::map<int, std::string>::iterator iter;
             iter = nodelist[i].attrs.find(0);
             slope.data_len = std::atoi(iter->second.c_str());
-            slope.data = ( float* )malloc(sizeof(float) * slope.data_len);
+            slope.data = (float*)malloc(sizeof(float) * slope.data_len);
             read(slope.data, sizeof(float) * slope.data_len);
             slope.dims.push_back(slope.data_len);
             paramlist.push_back(slope);
@@ -546,7 +553,7 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
             std::map<int, std::string>::iterator iter;
             iter = nodelist[i].attrs.find(0);
             scale.data_len = std::atoi(iter->second.c_str());
-            scale.data = ( float* )malloc(sizeof(float) * scale.data_len);
+            scale.data = (float*)malloc(sizeof(float) * scale.data_len);
             read(scale.data, sizeof(float) * scale.data_len);
             scale.dims.push_back(scale.data_len);
             paramlist.push_back(scale);
@@ -558,36 +565,37 @@ int ncnn_serializer::load_binary_file(const char* fname, std::vector<NcnnParam>&
                 NcnnParam bias;
                 bias.name = nodelist[i].name + "_b";
                 bias.data_len = scale.data_len;
-                bias.data = ( float* )malloc(sizeof(float) * scale.data_len);
+                bias.data = (float*)malloc(sizeof(float) * scale.data_len);
                 read(bias.data, sizeof(float) * scale.data_len);
                 bias.dims.push_back(scale.data_len);
                 paramlist.push_back(bias);
-            }   
+            }
         }
-        else if(nodelist[i].op == "MemoryData"){
+        else if (nodelist[i].op == "MemoryData")
+        {
             NcnnParam const_data;
             std::map<int, std::string>::iterator iter;
             int data_len = 1;
             int size = (int)nodelist[i].attrs.size();
             std::vector<int> dims(size);
-            for(iter = nodelist[i].attrs.begin(); iter != nodelist[i].attrs.end(); iter++)
+            for (iter = nodelist[i].attrs.begin(); iter != nodelist[i].attrs.end(); iter++)
             {
                 std::pair<int, std::string> pair = *iter;
                 data_len *= atoi(pair.second.c_str());
                 dims[pair.first] = atoi(pair.second.c_str());
             }
             const_data.name = nodelist[i].name;
-            const_data.dim_size = (int) dims.size();
+            const_data.dim_size = (int)dims.size();
             const_data.dims = dims;
             const_data.data_len = data_len;
-            const_data.data = (float*)malloc(sizeof(float)*data_len);
-            read(const_data.data, sizeof(float)* data_len);
+            const_data.data = (float*)malloc(sizeof(float) * data_len);
+            read(const_data.data, sizeof(float) * data_len);
             paramlist.push_back(const_data);
         }
     }
     if (nscan < 0)
     {
-       TLOG_ERR( "Cannot read the binary file: %s \n " , fname );
+        TLOG_ERR("Cannot read the binary file: %s \n ", fname);
     }
 #if 0
     printf("total size: %d \n", totalSize);
@@ -619,7 +627,7 @@ int ncnn_serializer::load_constant_tensor(ir_graph_t* graph, const std::vector<N
 #endif
         std::vector<int> dims = ncnn_tensor.dims;
         ir_tensor_t* ir_tensor = create_ir_tensor(graph, ncnn_tensor.name.c_str(), TENGINE_DT_FP32);
-        int *tensor_dims = new int[(int)dims.size()];
+        int* tensor_dims = new int[(int)dims.size()];
         for (int j = 0; j < (int)dims.size(); j++)
         {
             tensor_dims[j] = ncnn_tensor.dims[j];
@@ -629,8 +637,8 @@ int ncnn_serializer::load_constant_tensor(ir_graph_t* graph, const std::vector<N
         int tensor_size = ncnn_tensor.data_len * sizeof(float);
         ir_tensor->data = (float*)malloc(tensor_size);
 
-        float* mem_buf = ( float* )ir_tensor->data;
-        float* raw_data = ( float* )ncnn_tensor.data;
+        float* mem_buf = (float*)ir_tensor->data;
+        float* raw_data = (float*)ncnn_tensor.data;
         /* load data */
         for (int k = 0; k < ncnn_tensor.data_len; k++)
         {
@@ -639,7 +647,7 @@ int ncnn_serializer::load_constant_tensor(ir_graph_t* graph, const std::vector<N
         ir_node_t* ir_node = create_ir_node(graph, ncnn_tensor.name.c_str(), OP_CONST, OP_VERSION);
         set_ir_node_output_tensor(ir_node, 0, ir_tensor);
     }
-    
+
     return 0;
 }
 
@@ -673,7 +681,8 @@ float ParseNumber(const char* s, float d)
         bNegtiveBase = true;
         s++;
     }
-    for (; '0' == *s; nPreZero++, s++);
+    for (; '0' == *s; nPreZero++, s++)
+        ;
     for (; *s != '.' && *s != 'e' && *s != 'E' && *s != '\0'; s++)
     {
         if (*s < '0' || *s > '9')
@@ -775,7 +784,7 @@ int ncnn_serializer::set_graph_input(ir_graph_t* graph, const std::vector<NcnnNo
     for (unsigned int i = 0; i < nodelist.size(); i++)
     {
         const NcnnNode& ncnn_node = nodelist.at(i);
-        if(ncnn_node.op == "Input")
+        if (ncnn_node.op == "Input")
         {
             std::string input_name = ncnn_node.name;
 
@@ -785,13 +794,14 @@ int ncnn_serializer::set_graph_input(ir_graph_t* graph, const std::vector<NcnnNo
             if (GetParam(input_name, paramlist, param))
             {
                 std::vector<int> ir_dims = param.dims;
-                int *tensor_dims = new int[ir_dims.size()];
+                int* tensor_dims = new int[ir_dims.size()];
                 for (int j = 0; j < ir_dims.size(); j++)
                 {
                     tensor_dims[j] = ir_dims[j];
                 }
-                if (ir_dims.size() > 0);
-                    set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size());
+                if (ir_dims.size() > 0)
+                    ;
+                set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size());
             }
             ir_node_t* node = create_ir_node(graph, input_name.c_str(), OP_INPUT, OP_VERSION);
             set_ir_node_output_tensor(node, 0, ir_tensor);
@@ -820,19 +830,20 @@ int ncnn_serializer::set_graph_output(ir_graph_t* graph, const std::vector<NcnnN
         int tensor_id = get_ir_tensor_index_from_name(graph, input_name.c_str());
         ir_tensor_t* ir_tensor = get_ir_graph_tensor(graph, tensor_id);
 
-        if(ir_tensor->consumer_num == 0){
-
+        if (ir_tensor->consumer_num == 0)
+        {
             NcnnParam param;
             if (GetParam(input_name, paramlist, param))
             {
                 std::vector<int> ir_dims = param.dims;
-                int *tensor_dims = new int[ir_dims.size()];
+                int* tensor_dims = new int[ir_dims.size()];
                 for (int j = 0; j < ir_dims.size(); j++)
                 {
                     tensor_dims[j] = ir_dims[j];
                 }
-                if (ir_dims.size() > 0);
-                    set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size());
+                if (ir_dims.size() > 0)
+                    ;
+                set_ir_tensor_shape(ir_tensor, tensor_dims, ir_dims.size());
             }
 
             ir_node_t* node = create_ir_node(graph, input_name.c_str(), OP_INPUT, OP_VERSION);
@@ -851,7 +862,7 @@ int ncnn_serializer::set_graph_output(ir_graph_t* graph, const std::vector<NcnnN
 
 bool ncnn_serializer::find_op_load_method(const std::string& op_name)
 {
-    if(op_load_map.count(op_name))
+    if (op_load_map.count(op_name))
         return true;
 
     return false;
@@ -864,7 +875,7 @@ ir_tensor_t* ncnn_serializer::find_tensor(ir_graph_t* graph, const std::string&
         if (tensor->name == tensor_name)
             return tensor;
     }
-    
+
     return nullptr;
 }
 int ncnn_serializer::load_graph_node(ir_graph_t* graph, const std::vector<NcnnNode>& nodelist, const std::vector<NcnnParam>& paramlist)
@@ -875,8 +886,8 @@ int ncnn_serializer::load_graph_node(ir_graph_t* graph, const std::vector<NcnnNo
     for (i = 0; i < nodelist.size(); i++)
     {
         NcnnNode ncnn_node = nodelist.at(i);
-		if(ncnn_node.op == "Noop" && ncnn_node.output_name.size() == 0)
-		            node_to_remove.push_back(i);
+        if (ncnn_node.op == "Noop" && ncnn_node.output_name.size() == 0)
+            node_to_remove.push_back(i);
         if (!find_op_load_method(ncnn_node.op))
         {
             auto it = find(no_supported_op.begin(), no_supported_op.end(), ncnn_node.op);
@@ -887,10 +898,10 @@ int ncnn_serializer::load_graph_node(ir_graph_t* graph, const std::vector<NcnnNo
     for (i = 0; i < nodelist.size(); i++)
     {
         NcnnNode ncnn_node = nodelist.at(i);
-        if(ncnn_node.op == "Input" || ncnn_node.op == "MemoryData")
+        if (ncnn_node.op == "Input" || ncnn_node.op == "MemoryData")
             continue;
 
-        if(ncnn_node.op == "Noop" && ncnn_node.output_name.size() == 0)
+        if (ncnn_node.op == "Noop" && ncnn_node.output_name.size() == 0)
             continue;
 
         ir_node_t* ir_node = nullptr;
@@ -901,35 +912,35 @@ int ncnn_serializer::load_graph_node(ir_graph_t* graph, const std::vector<NcnnNo
             return -1;
         }
         int input_number = ncnn_node.inputs_name.size();
-        int size = ( int )nodelist.size();
+        int size = (int)nodelist.size();
         int in_num = 0;
         for (in_num = 0; in_num < input_number; in_num++)
         {
             std::string input_name = ncnn_node.inputs_name[in_num];
-            
+
             int tensor_id = get_ir_tensor_index_from_name(graph, input_name.c_str());
             ir_tensor_t* ir_tensor = get_ir_graph_tensor(graph, tensor_id);
             if (ir_tensor == NULL)
             {
                 fprintf(stderr, "Can not find tensor : %s \n", input_name.c_str());
             }
-            set_ir_node_input_tensor(ir_node, in_num, ir_tensor);     
-            size = ( int )paramlist.size();
+            set_ir_node_input_tensor(ir_node, in_num, ir_tensor);
+            size = (int)paramlist.size();
             int tensor_idx = 0;
-            for (int j = 0 ; j < size; j++)
+            for (int j = 0; j < size; j++)
             {
                 std::string input_name = paramlist[j].name;
                 std::string name = input_name.substr(0, input_name.length() - 2);
                 if (name == ncnn_node.name)
                 {
-                    tensor_idx++; 
+                    tensor_idx++;
                     ir_tensor_t* tensor = find_tensor(graph, paramlist[j].name);
-                    set_ir_node_input_tensor(ir_node, tensor_idx, tensor);   
+                    set_ir_node_input_tensor(ir_node, tensor_idx, tensor);
                 }
-            } 
+            }
         }
 
-        int out_size = ( int )ncnn_node.output_name.size();
+        int out_size = (int)ncnn_node.output_name.size();
         for (int j = 0; j < out_size; j++)
         {
             const std::string& output_name = ncnn_node.output_name[j];
@@ -972,7 +983,7 @@ int ncnn_serializer::load_model(ir_graph_t* graph, std::string bin_file, std::st
     if (load_constant_tensor(graph, nodelist, paramlist) < 0)
         return -1;
     fprintf(stderr, "Process 3: Finish load tensor data \n");
-    if (set_graph_input(graph, nodelist,paramlist ) < 0)
+    if (set_graph_input(graph, nodelist, paramlist) < 0)
         return -1;
     fprintf(stderr, "Process 4: Finish load graph input node \n");
     if (load_graph_node(graph, nodelist, paramlist) < 0)
@@ -1009,12 +1020,10 @@ graph_t ncnn_serializer::ncnn2tengine(std::string model_file, std::string proto_
     return ir_graph;
 }
 
-
-
 typedef std::map<int, std::string>::const_iterator const_iterator;
 int load_conv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct conv_param* param = ( struct conv_param* )node->op.param_mem;
+    struct conv_param* param = (struct conv_param*)node->op.param_mem;
 
     const_iterator iter;
 
@@ -1082,7 +1091,7 @@ int load_conv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_pool(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct pool_param* param = ( struct pool_param* )node->op.param_mem;
+    struct pool_param* param = (struct pool_param*)node->op.param_mem;
     const_iterator iter;
     iter = ncnn_node.attrs.find(0);
     if (iter != ncnn_node.attrs.end())
@@ -1128,13 +1137,13 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_relu(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct relu_param* relu_param = ( struct relu_param* )node->op.param_mem;
+    struct relu_param* relu_param = (struct relu_param*)node->op.param_mem;
 
     return 0;
 }
 int load_concat(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct concat_param* param = ( struct concat_param* )node->op.param_mem;
+    struct concat_param* param = (struct concat_param*)node->op.param_mem;
     const_iterator iter;
     iter = ncnn_node.attrs.find(0);
     if (iter != ncnn_node.attrs.end())
@@ -1144,7 +1153,7 @@ int load_concat(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_softmax(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct softmax_param* param = ( struct softmax_param* )node->op.param_mem;
+    struct softmax_param* param = (struct softmax_param*)node->op.param_mem;
     const_iterator iter;
 
     iter = ncnn_node.attrs.find(0);
@@ -1164,7 +1173,7 @@ int load_no_param(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_bn(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct batchnorm_param* param = ( struct batchnorm_param* )node->op.param_mem;
+    struct batchnorm_param* param = (struct batchnorm_param*)node->op.param_mem;
 
     const_iterator iter;
 
@@ -1176,7 +1185,7 @@ int load_bn(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_scale(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct scale_param* param = ( struct scale_param* )node->op.param_mem;
+    struct scale_param* param = (struct scale_param*)node->op.param_mem;
 
     const_iterator iter;
 
@@ -1188,7 +1197,7 @@ int load_scale(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_clip(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct clip_param* param = ( struct clip_param* )node->op.param_mem;
+    struct clip_param* param = (struct clip_param*)node->op.param_mem;
     const_iterator iter;
 
     iter = ncnn_node.attrs.find(1);
@@ -1203,7 +1212,7 @@ int load_clip(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_fc(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct fc_param* param = ( struct fc_param* )node->op.param_mem;
+    struct fc_param* param = (struct fc_param*)node->op.param_mem;
 
     const_iterator iter;
 
@@ -1215,8 +1224,7 @@ int load_fc(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_flatten(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct flatten_param* param = ( struct flatten_param* )node->op.param_mem;
-
+    struct flatten_param* param = (struct flatten_param*)node->op.param_mem;
 
     param->axis = 1;
 
@@ -1224,7 +1232,7 @@ int load_flatten(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct reshape_param* param = ( struct reshape_param* )node->op.param_mem;
+    struct reshape_param* param = (struct reshape_param*)node->op.param_mem;
     std::vector<int> dim_shape;
     const_iterator iter;
     iter = ncnn_node.attrs.find(3);
@@ -1246,8 +1254,10 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
         {
             dim_shape.push_back(std::atoi(iter->second.c_str()));
         }
-    }else {
-       dim_shape.push_back(0);    
+    }
+    else
+    {
+        dim_shape.push_back(0);
     }
     iter = ncnn_node.attrs.find(1);
     if (iter != ncnn_node.attrs.end())
@@ -1270,7 +1280,7 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
     param->re_shape = (int*)sys_malloc(sizeof(int) * size);
     param->dim_size = size;
     for (int i = 0; i < size; i++)
-    { 
+    {
         param->re_shape[i] = dim_shape[i];
     }
 
@@ -1278,7 +1288,7 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_eltwise(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct eltwise_param* param = ( struct eltwise_param* )node->op.param_mem;
+    struct eltwise_param* param = (struct eltwise_param*)node->op.param_mem;
     const_iterator iter;
 
     std::vector<float> coef;
@@ -1309,7 +1319,7 @@ int load_eltwise(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct interp_param* param = ( struct interp_param* )node->op.param_mem;
+    struct interp_param* param = (struct interp_param*)node->op.param_mem;
 
     std::vector<float> v1, v2;
     const_iterator iter;
@@ -1323,7 +1333,9 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
     {
         ParseAttr_n(iter->second, v1);
         param->width_scale = v1.at(0);
-    } else {
+    }
+    else
+    {
         param->width_scale = 0;
     }
     iter = ncnn_node.attrs.find(2);
@@ -1331,16 +1343,20 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
     {
         ParseAttr_n(iter->second, v2);
         param->height_scale = v2.at(0);
-    } else {
+    }
+    else
+    {
         param->height_scale = 0;
     }
     iter = ncnn_node.attrs.find(3);
-    if(iter != ncnn_node.attrs.end()){
+    if (iter != ncnn_node.attrs.end())
+    {
         ParseAttr_n(iter->second, v2);
         param->output_width = v2.at(0);
     }
     iter = ncnn_node.attrs.find(4);
-    if(iter != ncnn_node.attrs.end()){
+    if (iter != ncnn_node.attrs.end())
+    {
         ParseAttr_n(iter->second, v2);
         param->output_height = v2.at(0);
     }
@@ -1348,7 +1364,7 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 }
 int load_slice(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct slice_param* param = ( struct slice_param* )node->op.param_mem;
+    struct slice_param* param = (struct slice_param*)node->op.param_mem;
     // param->isncnn= true;
     param->iscaffe = false;
     param->ismxnet = false;
@@ -1357,75 +1373,78 @@ int load_slice(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
     const_iterator iter;
     iter = ncnn_node.attrs.find(0);
     std::vector<float> v1;
-    if(iter != ncnn_node.attrs.end()){
+    if (iter != ncnn_node.attrs.end())
+    {
         ParseAttr_n(iter->second, v1);
         std::vector<int> slice_shape;
-        for(int i = 0; i < (int)v1.size(); i++){
+        for (int i = 0; i < (int)v1.size(); i++)
+        {
             // param->slice_point_.push_back((int)v1.at(i));
         }
     }
     iter = ncnn_node.attrs.find(1);
-    if(iter != ncnn_node.attrs.end()){
-        param->axis = std::atoi(iter->second.c_str())+1;
+    if (iter != ncnn_node.attrs.end())
+    {
+        param->axis = std::atoi(iter->second.c_str()) + 1;
     }
     return 0;
 }
 
 int load_unary(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct unary_param* param = ( struct unary_param* )node->op.param_mem;
+    struct unary_param* param = (struct unary_param*)node->op.param_mem;
     const_iterator iter;
     iter = ncnn_node.attrs.find(0);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
         param->type = std::atoi(iter->second.c_str());
-    
+
     return 0;
 }
 int load_deconv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 {
-    struct deconv_param* param = ( struct deconv_param* )node->op.param_mem;
+    struct deconv_param* param = (struct deconv_param*)node->op.param_mem;
     const_iterator iter;
     std::vector<float> v1;
     iter = ncnn_node.attrs.find(0);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->num_output = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(1);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->kernel_w = std::atoi(iter->second.c_str());
         param->kernel_h = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(11);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->kernel_h = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(2);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->dilation_w = std::atoi(iter->second.c_str());
         param->dilation_h = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(12);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->dilation_h = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(3);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->stride_h = std::atoi(iter->second.c_str());
         param->stride_w = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(13);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->stride_w = std::atoi(iter->second.c_str());
-    }   
+    }
     iter = ncnn_node.attrs.find(4);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->pad_w0 = std::atoi(iter->second.c_str());
         param->pad_w1 = std::atoi(iter->second.c_str());
@@ -1433,22 +1452,22 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
         param->pad_h1 = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(15);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->pad_w1 = std::atoi(iter->second.c_str());
-    }   
+    }
     iter = ncnn_node.attrs.find(16);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->pad_h0 = std::atoi(iter->second.c_str());
-    }   
+    }
     iter = ncnn_node.attrs.find(17);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->pad_h1 = std::atoi(iter->second.c_str());
     }
     iter = ncnn_node.attrs.find(7);
-    if(iter != ncnn_node.attrs.end())
+    if (iter != ncnn_node.attrs.end())
     {
         param->group = std::atoi(iter->second.c_str());
     }
@@ -1460,26 +1479,26 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const NcnnNode& ncnn_node)
 */
 void ncnn_serializer::register_op_load()
 {
-    op_load_map["Convolution"]                          = std::pair<int, op_load_t>(OP_CONV,            load_conv);
-    op_load_map["ConvolutionDepthWise"]                 = std::pair<int, op_load_t>(OP_CONV,            load_conv);
-    op_load_map["Pooling"]                              = std::pair<int, op_load_t>(OP_POOL,            load_pool);
-    op_load_map["ReLU"]                                 = std::pair<int, op_load_t>(OP_RELU,            load_relu);
-    op_load_map["Concat"]                               = std::pair<int, op_load_t>(OP_CONCAT,          load_concat);
-    op_load_map["Softmax"]                              = std::pair<int, op_load_t>(OP_SOFTMAX,         load_softmax);
-    op_load_map["Dropout"]                              = std::pair<int, op_load_t>(OP_DROPOUT,         load_no_param);
-    op_load_map["BatchNorm"]                            = std::pair<int, op_load_t>(OP_BATCHNORM,       load_bn);
-    op_load_map["Scale"]                                = std::pair<int, op_load_t>(OP_SCALE,           load_scale);
-    op_load_map["Clip"]                                 = std::pair<int, op_load_t>(OP_CLIP,            load_clip);
-    op_load_map["InnerProduct"]                         = std::pair<int, op_load_t>(OP_FC,              load_fc);
+    op_load_map["Convolution"] = std::pair<int, op_load_t>(OP_CONV, load_conv);
+    op_load_map["ConvolutionDepthWise"] = std::pair<int, op_load_t>(OP_CONV, load_conv);
+    op_load_map["Pooling"] = std::pair<int, op_load_t>(OP_POOL, load_pool);
+    op_load_map["ReLU"] = std::pair<int, op_load_t>(OP_RELU, load_relu);
+    op_load_map["Concat"] = std::pair<int, op_load_t>(OP_CONCAT, load_concat);
+    op_load_map["Softmax"] = std::pair<int, op_load_t>(OP_SOFTMAX, load_softmax);
+    op_load_map["Dropout"] = std::pair<int, op_load_t>(OP_DROPOUT, load_no_param);
+    op_load_map["BatchNorm"] = std::pair<int, op_load_t>(OP_BATCHNORM, load_bn);
+    op_load_map["Scale"] = std::pair<int, op_load_t>(OP_SCALE, load_scale);
+    op_load_map["Clip"] = std::pair<int, op_load_t>(OP_CLIP, load_clip);
+    op_load_map["InnerProduct"] = std::pair<int, op_load_t>(OP_FC, load_fc);
     // op_load_map["PriorBox"]                          = std::pair<int, op_load_t>();
-    op_load_map["Flatten"]                              = std::pair<int, op_load_t>(OP_FLATTEN,         load_flatten);
-    op_load_map["Reshape"]                              = std::pair<int, op_load_t>(OP_RESHAPE,         load_reshape);
-    op_load_map["Eltwise"]                              = std::pair<int, op_load_t>(OP_ELTWISE,         load_eltwise);
-    op_load_map["Interp"]                               = std::pair<int, op_load_t>(OP_INTERP,          load_resize);
-    op_load_map["Slice"]                                = std::pair<int, op_load_t>(OP_SLICE,           load_slice);
-    op_load_map["Sigmoid"]                              = std::pair<int, op_load_t>(OP_SIGMOID,         load_no_param);
-    op_load_map["UnaryOp"]                              = std::pair<int, op_load_t>(OP_UNARY,           load_unary);
-    op_load_map["DeconvolutionDepthWise"]               = std::pair<int, op_load_t>(OP_DECONV,          load_deconv);
+    op_load_map["Flatten"] = std::pair<int, op_load_t>(OP_FLATTEN, load_flatten);
+    op_load_map["Reshape"] = std::pair<int, op_load_t>(OP_RESHAPE, load_reshape);
+    op_load_map["Eltwise"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Interp"] = std::pair<int, op_load_t>(OP_INTERP, load_resize);
+    op_load_map["Slice"] = std::pair<int, op_load_t>(OP_SLICE, load_slice);
+    op_load_map["Sigmoid"] = std::pair<int, op_load_t>(OP_SIGMOID, load_no_param);
+    op_load_map["UnaryOp"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["DeconvolutionDepthWise"] = std::pair<int, op_load_t>(OP_DECONV, load_deconv);
 }
 /*
 *   OPERAOTR REGISTER FUNCTION DEFINE FOR NCNN SERIALIZER END
diff --git a/tools/convert_tool/ncnn/ncnn2tengine.hpp b/tools/convert_tool/ncnn/ncnn2tengine.hpp
index d55149a21..e4f06baaf 100644
--- a/tools/convert_tool/ncnn/ncnn2tengine.hpp
+++ b/tools/convert_tool/ncnn/ncnn2tengine.hpp
@@ -36,19 +36,18 @@
 #include <set>
 #include <algorithm>
 
-extern "C" 
-{
-    #include "tengine/c_api.h"
-    #include "graph/graph.h"
-    #include "graph/subgraph.h"
-    #include "graph/node.h"
-    #include "graph/tensor.h"
-    #include "executer/executer.h"
-    #include "module/module.h"
-    #include "utility/log.h"
-    #include "utility/sys_port.h"
-    #include "utility/vector.h"
-    #include "../utils/save_graph/op_include.h"
+extern "C" {
+#include "tengine/c_api.h"
+#include "graph/graph.h"
+#include "graph/subgraph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "executer/executer.h"
+#include "module/module.h"
+#include "utility/log.h"
+#include "utility/sys_port.h"
+#include "utility/vector.h"
+#include "../utils/save_graph/op_include.h"
 }
 #define NCNN_MAX_PARAM_COUNT 32
 
@@ -58,7 +57,7 @@ struct NcnnNode
     std::string name;
     int optimized;
     std::map<int, std::string> attrs;
-    std::map<int, std::vector<std::string>> opt_attrs;
+    std::map<int, std::vector<std::string> > opt_attrs;
     //std::vector<int> inputs;
     std::vector<std::string> inputs_name;
     std::vector<std::string> output_name;
@@ -81,13 +80,13 @@ class ncnn_serializer
     typedef std::map<int, std::string>::const_iterator const_iterator;
 
 private:
-    std::unordered_map<std::string, std::pair<int, op_load_t>> op_load_map;
+    std::unordered_map<std::string, std::pair<int, op_load_t> > op_load_map;
     int load_model(ir_graph_t* graph, std::string params_file, std::string bin_file);
     int set_graph_input(ir_graph_t* graph, const std::vector<NcnnNode>& nodelist, const std::vector<NcnnParam>& paramlist);
     int load_constant_tensor(ir_graph_t* graph, const std::vector<NcnnNode>& nodelist, const std::vector<NcnnParam>& paramlist);
     int load_binary_file(const char* fname, std::vector<NcnnParam>& paramlist, std::vector<NcnnNode>& nodelist);
     int load_model_file(const char* fname, std::vector<NcnnNode>& nodelist);
-    int load_graph_node(ir_graph_t* graph,const std::vector<NcnnNode>& nodelist,  const std::vector<NcnnParam>& paramlist);
+    int load_graph_node(ir_graph_t* graph, const std::vector<NcnnNode>& nodelist, const std::vector<NcnnParam>& paramlist);
     bool find_op_load_method(const std::string& op_name);
     int read(void* buf, int size);
     ir_tensor_t* find_tensor(ir_graph_t* graph, const std::string& tensor_name);
@@ -98,12 +97,16 @@ class ncnn_serializer
     struct
     {
         int loaded;
-        union { int i; float f; };
+        union
+        {
+            int i;
+            float f;
+        };
         float* f_data;
         int* i_data;
         float* f_data_array;
         int* i_data_array;
-    } params[NCNN_MAX_PARAM_COUNT];  
+    } params[NCNN_MAX_PARAM_COUNT];
 };
 
 #endif
\ No newline at end of file
diff --git a/tools/convert_tool/onnx/onnx2tengine.cpp b/tools/convert_tool/onnx/onnx2tengine.cpp
index 5c70f6f2b..aa152df69 100644
--- a/tools/convert_tool/onnx/onnx2tengine.cpp
+++ b/tools/convert_tool/onnx/onnx2tengine.cpp
@@ -25,12 +25,11 @@
 
 #include "onnx2tengine.hpp"
 
-
 /*
 *   SELF DEFINE VARIABLE
 *   FOR ONNX SERIALIZER
 */
-const int OP_VERSION=1;
+const int OP_VERSION = 1;
 static int op_set;
 
 /*
@@ -38,7 +37,7 @@ static int op_set;
 */
 bool onnx_serializer::find_op_load_method(const std::string& op_name)
 {
-    if(op_load_map.count(op_name))
+    if (op_load_map.count(op_name))
         return true;
 
     return false;
@@ -52,9 +51,9 @@ ir_tensor_t* find_tensor(ir_graph_t* graph, const std::string& tensor_name)
         if (tensor->name == tensor_name)
         {
             return tensor;
-        }    
+        }
     }
-    
+
     return nullptr;
 }
 
@@ -76,29 +75,29 @@ const int get_onnx_tensor_data_type(const onnx::TensorProto& onnx_tensor)
     int tensor_data_type = -1;
     switch (onnx_tensor.data_type())
     {
-        case 1:
-            tensor_data_type = TENGINE_DT_FP32;
-            break;
-        case 2:
-            tensor_data_type = TENGINE_DT_UINT8;
-            break;
-        case 3:
-            tensor_data_type = TENGINE_DT_INT8;
-            break;
-        case 5:
-            tensor_data_type = TENGINE_DT_INT16;
-            break;
-        case 6: // int 32
-        case 7: // int 64
-            tensor_data_type = TENGINE_DT_INT32;
-            break;
-        case 10:
-            tensor_data_type = TENGINE_DT_FP16;
-            break;
-        
-        default:
-            fprintf(stderr, "tensor: %s. data type unsupported in get data type: %d.\n", onnx_tensor.name().c_str(), onnx_tensor.data_type());
-            return -1;
+    case 1:
+        tensor_data_type = TENGINE_DT_FP32;
+        break;
+    case 2:
+        tensor_data_type = TENGINE_DT_UINT8;
+        break;
+    case 3:
+        tensor_data_type = TENGINE_DT_INT8;
+        break;
+    case 5:
+        tensor_data_type = TENGINE_DT_INT16;
+        break;
+    case 6: // int 32
+    case 7: // int 64
+        tensor_data_type = TENGINE_DT_INT32;
+        break;
+    case 10:
+        tensor_data_type = TENGINE_DT_FP16;
+        break;
+
+    default:
+        fprintf(stderr, "tensor: %s. data type unsupported in get data type: %d.\n", onnx_tensor.name().c_str(), onnx_tensor.data_type());
+        return -1;
     }
 
     return tensor_data_type;
@@ -122,11 +121,11 @@ onnx::TensorProto get_node_attr_tensor(const onnx::NodeProto& node, const char*
 *   ASSIST FUNCTIONS FOR ONNX SERIALIZER END
 */
 
-int onnx_serializer::load_model_file(std::string model_file, onnx::ModelProto &model)
+int onnx_serializer::load_model_file(std::string model_file, onnx::ModelProto& model)
 {
     std::ifstream is(model_file, std::ios::in | std::ios::binary);
 
-    if(!is.is_open())
+    if (!is.is_open())
     {
         fprintf(stderr, "cannot open file: %s \n", model_file.c_str());
         return -1;
@@ -145,7 +144,7 @@ int onnx_serializer::load_model_file(std::string model_file, onnx::ModelProto &m
 
     is.close();
 
-    if(!ret)
+    if (!ret)
     {
         fprintf(stderr, "onnx serializer: parse file: %s \n", model_file.c_str());
         return -1;
@@ -190,14 +189,12 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr
     }
     for (int i = 0; i < node_count; i++)
     {
-        
         const onnx::NodeProto& node = onnx_graph.node(i);
 
         const std::string& op = node.op_type();
 
-        
-        if ((op == "Reshape" || op == "Gather" || op == "Div" || op == "Resize")  )
-        {            
+        if ((op == "Reshape" || op == "Gather" || op == "Div" || op == "Resize"))
+        {
             const onnx::TensorProto& onnx_tensor = node_tensor[node.input(1)];
             std::pair<std::string, bool> t(node.input(1), 0);
             tensor_check.insert(t);
@@ -206,10 +203,10 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr
             {
                 return -1;
             }
-            
+
             const char* name = node.input(1).c_str();
             int dim_num = onnx_tensor.dims_size();
-            int *dims = new int[dim_num];
+            int* dims = new int[dim_num];
             for (int j = 0; j < dim_num; j++)
             {
                 dims[j] = onnx_tensor.dims(j);
@@ -225,19 +222,19 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr
             set_ir_tensor_shape(ir_tensor, dims, dim_num);
             ir_tensor->tensor_type = TENSOR_TYPE_CONST;
             // set tensor data
-            if ( 7 == onnx_tensor.data_type())
+            if (7 == onnx_tensor.data_type())
             {
-                int tensor_size = ir_tensor->elem_num *  sizeof(int64_t);
+                int tensor_size = ir_tensor->elem_num * sizeof(int64_t);
                 ir_tensor->data = sys_malloc(tensor_size);
                 int64_t* mem_buf = (int64_t*)ir_tensor->data;
-                if(onnx_tensor.has_raw_data())
+                if (onnx_tensor.has_raw_data())
                 {
                     int64_t* raw_data = (int64_t*)onnx_tensor.raw_data().data();
                     for (int j = 0; j < ir_tensor->elem_num; j++)
                     {
                         mem_buf[j] = raw_data[j];
                     }
-                } 
+                }
                 else
                 {
                     int64_t* raw_data = (int64_t*)onnx_tensor.int64_data().data();
@@ -249,10 +246,10 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr
             }
             else
             {
-                int tensor_size = ir_tensor->elem_num *  sizeof(uint8_t);
+                int tensor_size = ir_tensor->elem_num * sizeof(uint8_t);
                 ir_tensor->data = sys_malloc(tensor_size);
                 uint8_t* mem_buf = (uint8_t*)ir_tensor->data;
-                if(onnx_tensor.has_raw_data())
+                if (onnx_tensor.has_raw_data())
                 {
                     uint8_t* raw_data = (uint8_t*)onnx_tensor.raw_data().data();
                     for (int j = 0; j < ir_tensor->elem_num; j++)
@@ -272,9 +269,8 @@ int onnx_serializer::load_constant_tensor(ir_graph_t* graph, const onnx::GraphPr
             ir_node_t* ir_node = create_ir_node(graph, name, OP_CONST, OP_VERSION);
             set_ir_node_output_tensor(ir_node, 0, ir_tensor);
         }
-        
     }
-    
+
     return 0;
 }
 
@@ -284,7 +280,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap
     for (int i = 0; i < const_tensor_num; i++)
     {
         const onnx::TensorProto& onnx_tensor = onnx_graph.initializer(i);
-        
+
         if (onnx_tensor.data_type() != 1 && onnx_tensor.data_type() != 6 && onnx_tensor.data_type() != 7) // fp32 int32 int64
         {
             fprintf(stderr, "const tensor data type is not fp32 or int32 or int64. \n");
@@ -300,7 +296,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap
         }
         const char* name = onnx_tensor.name().c_str();
         int dim_num = onnx_tensor.dims_size();
-        int *dims = new int[dim_num];
+        int* dims = new int[dim_num];
         for (int j = 0; j < dim_num; j++)
         {
             dims[j] = onnx_tensor.dims(j);
@@ -320,12 +316,12 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap
             ir_tensor->dim_num = 1;
             ir_tensor->dims[0] = 1;
         }
-        
+
         if (onnx_tensor.has_raw_data())
         {
             if (onnx_tensor.data_type() == 1) //fp32
             {
-                int tensor_size = ir_tensor->elem_num *  sizeof(float);
+                int tensor_size = ir_tensor->elem_num * sizeof(float);
                 ir_tensor->data = sys_malloc(tensor_size);
                 float* mem_buf = (float*)ir_tensor->data;
                 float* raw_data = (float*)onnx_tensor.raw_data().c_str();
@@ -336,7 +332,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap
             }
             else if (onnx_tensor.data_type() == 6) // int32
             {
-                int tensor_size = ir_tensor->elem_num *  sizeof(int32_t);
+                int tensor_size = ir_tensor->elem_num * sizeof(int32_t);
                 ir_tensor->data = sys_malloc(tensor_size);
                 int32_t* mem_buf = (int32_t*)ir_tensor->data;
                 int32_t* raw_data = (int32_t*)onnx_tensor.raw_data().data();
@@ -347,7 +343,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap
             }
             else if (onnx_tensor.data_type() == 7) // int64
             {
-                int tensor_size = ir_tensor->elem_num *  sizeof(int64_t);
+                int tensor_size = ir_tensor->elem_num * sizeof(int64_t);
                 ir_tensor->data = sys_malloc(tensor_size);
                 int64_t* mem_buf = (int64_t*)ir_tensor->data;
                 int64_t* raw_data = (int64_t*)onnx_tensor.raw_data().data();
@@ -403,7 +399,7 @@ int onnx_serializer::load_initializer_tensor(ir_graph_t* graph, const onnx::Grap
                 return -1;
             }
         }
-        
+
         ir_node_t* ir_node = create_ir_node(graph, name, OP_CONST, OP_VERSION);
         set_ir_node_output_tensor(ir_node, 0, ir_tensor);
     }
@@ -416,7 +412,7 @@ int onnx_serializer::set_graph_input(ir_graph_t* graph, const onnx::GraphProto&
     for (int i = 0; i < onnx_graph.input_size(); i++)
     {
         const onnx::ValueInfoProto& val = onnx_graph.input(i);
-        if(get_ir_tensor_index_from_name(graph, val.name().c_str()) != -1)
+        if (get_ir_tensor_index_from_name(graph, val.name().c_str()) != -1)
             continue;
 
         // now, catch an input tensor
@@ -424,11 +420,11 @@ int onnx_serializer::set_graph_input(ir_graph_t* graph, const onnx::GraphProto&
         const onnx::TypeProto::Tensor& tensor_type = type.tensor_type();
         const onnx::TensorShapeProto& shape = tensor_type.shape();
         int has_shape = 1;
-        int *dims = new int[shape.dim_size()];
-        for(int j = 0; j < shape.dim_size(); j++)
+        int* dims = new int[shape.dim_size()];
+        for (int j = 0; j < shape.dim_size(); j++)
         {
             const onnx::TensorShapeProto::Dimension& dim = shape.dim(j);
-            if(dim.has_dim_param())
+            if (dim.has_dim_param())
             {
                 has_shape = 0;
                 break;
@@ -460,26 +456,26 @@ int onnx_serializer::load_graph_node(ir_graph_t* graph, const onnx::GraphProto&
 {
     int i;
     std::vector<std::string> no_supported_op;
-    for(i = 0; i < onnx_graph.node_size(); i++)
+    for (i = 0; i < onnx_graph.node_size(); i++)
     {
         const onnx::NodeProto& onnx_node = onnx_graph.node(i);
         const std::string& onnx_op_name = onnx_node.op_type();
 
-        if(!find_op_load_method(onnx_op_name))
+        if (!find_op_load_method(onnx_op_name))
         {
-            auto it = find(no_supported_op.begin(),no_supported_op.end(),onnx_op_name);
-            if(it == no_supported_op.end())
+            auto it = find(no_supported_op.begin(), no_supported_op.end(), onnx_op_name);
+            if (it == no_supported_op.end())
             {
-                if(onnx_op_name == "Constant")
+                if (onnx_op_name == "Constant")
                     continue;
                 no_supported_op.push_back(onnx_op_name);
             }
         }
     }
-    if(no_supported_op.size())
+    if (no_supported_op.size())
     {
         fprintf(stderr, "These %zu op are not supported\n{ ", no_supported_op.size());
-        for(int j = 0; j < (int) no_supported_op.size(); j++)
+        for (int j = 0; j < (int)no_supported_op.size(); j++)
         {
             fprintf(stderr, "%s ", no_supported_op[j].c_str());
         }
@@ -487,7 +483,7 @@ int onnx_serializer::load_graph_node(ir_graph_t* graph, const onnx::GraphProto&
         return -1;
     }
 
-    for(i = 0; i < onnx_graph.node_size(); i++)
+    for (i = 0; i < onnx_graph.node_size(); i++)
     {
         /* create ir node*/
         const onnx::NodeProto& onnx_node = onnx_graph.node(i);
@@ -515,7 +511,7 @@ int onnx_serializer::load_graph_node(ir_graph_t* graph, const onnx::GraphProto&
                 continue;
             }
             int tensor_id = get_ir_tensor_index_from_name(graph, input_name.c_str());
-            ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id);        
+            ir_tensor_t* tensor = get_ir_graph_tensor(graph, tensor_id);
             tensor_check[tensor->name] = tensor_check[tensor->name] + 1;
             set_ir_node_input_tensor(ir_node, j, tensor);
         }
@@ -549,16 +545,15 @@ int onnx_serializer::set_graph_output(ir_graph_t* graph, const onnx::GraphProto&
         const onnx::ValueInfoProto& val = onnx_graph.output(i);
         int tensor_id = get_ir_tensor_index_from_name(graph, val.name().c_str());
 
-
         const onnx::TypeProto& type = val.type();
         const onnx::TypeProto::Tensor& tensor_type = type.tensor_type();
         const onnx::TensorShapeProto& shape = tensor_type.shape();
         int has_shape = 1;
-        int *dims = new int[shape.dim_size()];
-        for(int j = 0; j < shape.dim_size(); j++)
+        int* dims = new int[shape.dim_size()];
+        for (int j = 0; j < shape.dim_size(); j++)
         {
             const onnx::TensorShapeProto::Dimension& dim = shape.dim(j);
-            if(dim.has_dim_param())
+            if (dim.has_dim_param())
             {
                 has_shape = 0;
                 break;
@@ -633,7 +628,7 @@ graph_t onnx_serializer::onnx2tengine(std::string model_file)
 
 int load_conv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct conv_param* conv_param = ( struct conv_param* )node->op.param_mem;
+    struct conv_param* conv_param = (struct conv_param*)node->op.param_mem;
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
@@ -697,45 +692,45 @@ int load_conv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
 int load_relu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct relu_param* relu_param = ( struct relu_param* )node->op.param_mem;
+    struct relu_param* relu_param = (struct relu_param*)node->op.param_mem;
     relu_param->negative_slope = 0.f;
     return 0;
 }
 
 int load_pool(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct pool_param* pool_param = ( struct pool_param* )node->op.param_mem;
+    struct pool_param* pool_param = (struct pool_param*)node->op.param_mem;
     const std::string& onnx_op = onnx_node.op_type();
 
-    if(onnx_op == "GlobalAveragePool")
+    if (onnx_op == "GlobalAveragePool")
     {
         pool_param->global = 1;
         pool_param->pool_method = POOL_AVG;
     }
-    else if(onnx_op == "MaxPool" || onnx_op == "AveragePool")
+    else if (onnx_op == "MaxPool" || onnx_op == "AveragePool")
     {
         pool_param->global = 0;
 
-        if(onnx_op == "AveragePool")
+        if (onnx_op == "AveragePool")
             pool_param->pool_method = POOL_AVG;
         else
             pool_param->pool_method = POOL_MAX;
 
-        for(int k = 0; k < onnx_node.attribute_size(); k++)
+        for (int k = 0; k < onnx_node.attribute_size(); k++)
         {
             const onnx::AttributeProto& attr = onnx_node.attribute(k);
 
-            if(attr.name() == "kernel_shape")
+            if (attr.name() == "kernel_shape")
             {
                 pool_param->kernel_h = attr.ints(0);
                 pool_param->kernel_w = attr.ints(1);
             }
-            else if(attr.name() == "strides")
+            else if (attr.name() == "strides")
             {
                 pool_param->stride_h = attr.ints(0);
                 pool_param->stride_w = attr.ints(1);
             }
-            else if(attr.name() == "pads") /* onnx pads: x0_begin, x1_begin, ... , x0_end, x1_end, ... */
+            else if (attr.name() == "pads") /* onnx pads: x0_begin, x1_begin, ... , x0_end, x1_end, ... */
             {
                 pool_param->pad_h0 = attr.ints(0);
                 pool_param->pad_h1 = attr.ints(2);
@@ -758,7 +753,7 @@ int load_pool(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
 int load_flatten(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct flatten_param* flatten_param = ( struct flatten_param* )node->op.param_mem;
+    struct flatten_param* flatten_param = (struct flatten_param*)node->op.param_mem;
     flatten_param->axis = 1;
 
     if (1 == onnx_node.attribute_size())
@@ -771,7 +766,7 @@ int load_flatten(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx
 
 int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct gemm_param* gemm_param = ( struct gemm_param* )node->op.param_mem;
+    struct gemm_param* gemm_param = (struct gemm_param*)node->op.param_mem;
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
@@ -813,7 +808,7 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
         // float* tmp = ( float* )sys_malloc(k * n * sizeof(float));
         std::vector<float> tmp(k * n);
-        float* data = ( float* )weight_tensor->data;
+        float* data = (float*)weight_tensor->data;
         for (int i = 0; i < n; i++)
             for (int j = 0; j < k; j++)
             {
@@ -826,7 +821,7 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
     if (gemm_param->alpha != 1)
     {
-        float* data = ( float* )weight_tensor->data;
+        float* data = (float*)weight_tensor->data;
         int tensor_size = weight_tensor->dims[0] * weight_tensor->dims[1];
 
         for (int i = 0; i < tensor_size; i++)
@@ -835,7 +830,7 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
     if (gemm_param->beta != 1)
     {
-        float* data = ( float* )bias_tensor->data;
+        float* data = (float*)bias_tensor->data;
         int tensor_size = weight_tensor->dims[0];
 
         for (int i = 0; i < tensor_size; i++)
@@ -848,13 +843,13 @@ int load_gemm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
     }
     struct fc_param* fc_param = (struct fc_param*)node->op.param_mem;
     fc_param->num_output = weight_tensor->dims[0];
-    
+
     return 0;
 }
 
 int load_concat(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct concat_param* concat_param = ( struct concat_param* )node->op.param_mem;
+    struct concat_param* concat_param = (struct concat_param*)node->op.param_mem;
 
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
@@ -870,7 +865,7 @@ int load_concat(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
 
 int load_bn(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )node->op.param_mem;
+    struct batchnorm_param* batchnorm_param = (struct batchnorm_param*)node->op.param_mem;
 
     // get espilon
     for (int k = 0; k < onnx_node.attribute_size(); k++)
@@ -888,39 +883,39 @@ int load_bn(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node
 
 int load_eltwise(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct eltwise_param* eltwise_param = ( struct eltwise_param* )node->op.param_mem;
+    struct eltwise_param* eltwise_param = (struct eltwise_param*)node->op.param_mem;
     const std::string& op_name = onnx_node.op_type();
     if (op_name == "Add")
     {
-        eltwise_param->type = ELT_SUM;  
+        eltwise_param->type = ELT_SUM;
     }
     else if (op_name == "Mul")
     {
-        eltwise_param->type = ELT_PROD; 
+        eltwise_param->type = ELT_PROD;
     }
     else if (op_name == "Div")
     {
-        eltwise_param->type = ELT_DIV;  
+        eltwise_param->type = ELT_DIV;
     }
     else if (op_name == "Floor")
     {
-        eltwise_param->type = ELT_FLOOR;    
+        eltwise_param->type = ELT_FLOOR;
     }
     else if (op_name == "Exp")
     {
-        eltwise_param->type = ELT_EXP;  
+        eltwise_param->type = ELT_EXP;
     }
     else if (op_name == "Sub")
     {
-        eltwise_param->type = ELT_SUB;  
+        eltwise_param->type = ELT_SUB;
     }
     else if (op_name == "Pow")
     {
-        eltwise_param->type = ELT_POW;  
+        eltwise_param->type = ELT_POW;
     }
     else if (op_name == "Sqrt")
     {
-        eltwise_param->type = ELT_SQRT; 
+        eltwise_param->type = ELT_SQRT;
     }
 
     return 0;
@@ -928,8 +923,8 @@ int load_eltwise(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx
 
 int load_transpose(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct transpose_param* transpose_param = ( struct transpose_param* )node->op.param_mem;
-    
+    struct transpose_param* transpose_param = (struct transpose_param*)node->op.param_mem;
+
     const onnx::AttributeProto& attr = onnx_node.attribute(0);
     int size = attr.ints_size();
     transpose_param->tr_shape = (int*)sys_malloc(sizeof(int) * size);
@@ -944,7 +939,7 @@ int load_transpose(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& on
 
 int load_clip(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct clip_param* clip_param = ( struct clip_param* )node->op.param_mem;
+    struct clip_param* clip_param = (struct clip_param*)node->op.param_mem;
 
     int size = onnx_node.attribute_size();
     for (int i = 0; i < size; i++)
@@ -975,7 +970,7 @@ int load_clip(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
 int load_reshape(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct reshape_param* reshape_param = ( struct reshape_param* )node->op.param_mem;
+    struct reshape_param* reshape_param = (struct reshape_param*)node->op.param_mem;
 
     ir_tensor_t* shape_tensor = find_tensor(graph, onnx_node.input(1));
     if (shape_tensor == nullptr)
@@ -990,7 +985,7 @@ int load_reshape(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx
 
     int64_t* data = (int64_t*)shape_tensor->data;
     for (int i = 0; i < size; i++)
-    { 
+    {
         reshape_param->re_shape[i] = data[i];
     }
     return 0;
@@ -1004,7 +999,7 @@ int load_no_param(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onn
 
 int load_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct softmax_param* softmax_param = ( struct softmax_param* )node->op.param_mem;
+    struct softmax_param* softmax_param = (struct softmax_param*)node->op.param_mem;
 
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
@@ -1026,7 +1021,7 @@ int load_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx
 
 int load_elu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct elu_param* elu_param = ( struct elu_param* )node->op.param_mem;
+    struct elu_param* elu_param = (struct elu_param*)node->op.param_mem;
 
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
@@ -1053,9 +1048,9 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
             mode = attr.s();
         }
     }
-    if(mode != "nearest")
+    if (mode != "nearest")
     {
-        struct interp_param* interp_param = ( struct interp_param* )node->op.param_mem;
+        struct interp_param* interp_param = (struct interp_param*)node->op.param_mem;
 
         if (onnx_node.input_size() == 1)
         {
@@ -1085,7 +1080,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
         {
             const std::string& input_name = onnx_node.input(1);
             ir_tensor_t* tensor = find_tensor(graph, input_name);
-            float* data = ( float* )tensor->data;
+            float* data = (float*)tensor->data;
 
             interp_param->height_scale = data[2];
             interp_param->width_scale = data[3];
@@ -1098,7 +1093,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
         {
             interp_param->resize_type = 2;
         }
-    } 
+    }
     else
     {
         if (change_node_op(node, OP_RESIZE) < 0)
@@ -1111,7 +1106,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
         {
             const std::string& input_name = onnx_node.input(1);
             ir_tensor_t* tensor = find_tensor(graph, input_name);
-            float* data = ( float* )tensor->data;
+            float* data = (float*)tensor->data;
             resize_param->scale_h = data[2];
             resize_param->scale_w = data[3];
         }
@@ -1127,7 +1122,7 @@ int load_interp(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
 
 int load_leaky_relu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct relu_param* relu_param = ( struct relu_param* )node->op.param_mem;
+    struct relu_param* relu_param = (struct relu_param*)node->op.param_mem;
     const onnx::AttributeProto& attr = onnx_node.attribute(0);
     relu_param->negative_slope = attr.f();
 
@@ -1136,7 +1131,7 @@ int load_leaky_relu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& o
 
 int load_slice(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct slice_param* slice_param = ( struct slice_param* )node->op.param_mem;
+    struct slice_param* slice_param = (struct slice_param*)node->op.param_mem;
 
     slice_param->step = 1;
     slice_param->axis = 0;
@@ -1162,7 +1157,7 @@ int load_slice(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n
                 {
                     end = INT_MAX;
                 }
-                slice_param->end = ( int )end;
+                slice_param->end = (int)end;
             }
             else if (attr.name() == "starts")
             {
@@ -1200,7 +1195,7 @@ int load_slice(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n
 
 int load_split(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct split_param* split_param = ( struct split_param* )node->op.param_mem;
+    struct split_param* split_param = (struct split_param*)node->op.param_mem;
     split_param->is_onnx = true;
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
@@ -1229,7 +1224,7 @@ int load_split(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n
 
 int load_unsqueeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct unsqueeze_param* unsqueeze_param = ( struct unsqueeze_param* )node->op.param_mem;
+    struct unsqueeze_param* unsqueeze_param = (struct unsqueeze_param*)node->op.param_mem;
 
     std::vector<int> axises;
     for (int k = 0; k < onnx_node.attribute_size(); k++)
@@ -1248,12 +1243,12 @@ int load_unsqueeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& on
     if (axises.empty() && node->input_num == 2)
     {
         ir_tensor_t* axes_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
-        int* data = ( int* )axes_tensor->data;
+        int* data = (int*)axes_tensor->data;
         for (int i = 0; i < axes_tensor->elem_num; i++)
         {
             axises.push_back(data[i]);
         }
-        
+
         // remove axes tensor
         node->input_num = 1;
     }
@@ -1265,13 +1260,13 @@ int load_unsqueeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& on
     {
         unsqueeze_param->axises[i] = axises[i];
     }
-    
+
     return 0;
 }
 
 int load_squeeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct squeeze_param* squeeze_param = ( struct squeeze_param* )node->op.param_mem;
+    struct squeeze_param* squeeze_param = (struct squeeze_param*)node->op.param_mem;
 
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
@@ -1299,7 +1294,7 @@ int load_squeeze(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx
             }
         }
     }
-    
+
     return 0;
 }
 
@@ -1308,7 +1303,7 @@ int load_matmul(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
     ir_tensor_t* input_tensor = find_tensor(graph, onnx_node.input(0));
     ir_tensor_t* weight_tensor = find_tensor(graph, onnx_node.input(1));
 
-    if(2 == input_tensor->dim_num && weight_tensor->tensor_type == TENSOR_TYPE_CONST)
+    if (2 == input_tensor->dim_num && weight_tensor->tensor_type == TENSOR_TYPE_CONST)
     {
         // swap shape
         int k = weight_tensor->dims[0];
@@ -1319,7 +1314,7 @@ int load_matmul(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
 
         // float* tmp = ( float* )sys_malloc(k * n * sizeof(float));
         std::vector<float> tmp(k * n);
-        float* data = ( float* )weight_tensor->data;
+        float* data = (float*)weight_tensor->data;
 
         for (int i = 0; i < n; i++)
         {
@@ -1335,36 +1330,36 @@ int load_matmul(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
         {
             return -1;
         }
-        struct fc_param* fc_param = ( struct fc_param* )node->op.param_mem;
+        struct fc_param* fc_param = (struct fc_param*)node->op.param_mem;
         fc_param->num_output = weight_tensor->dims[0];
     }
-    
+
     return 0;
 }
 
 int load_reducel2(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct reducel2_param* reducel2_param = ( struct reducel2_param* )node->op.param_mem;
+    struct reducel2_param* reducel2_param = (struct reducel2_param*)node->op.param_mem;
 
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
         if (attr.name() == "axes")
         {
-            reducel2_param->axis = attr.ints(0);    // TODO:Support muti axis
+            reducel2_param->axis = attr.ints(0); // TODO:Support muti axis
         }
         if (attr.name() == "keepdims")
         {
             reducel2_param->keepdim = attr.i();
         }
     }
-    
+
     return 0;
 }
 
 int load_gather(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct gather_param* gather_param = ( struct gather_param* )node->op.param_mem;
+    struct gather_param* gather_param = (struct gather_param*)node->op.param_mem;
 
     ir_tensor_t* indices_tensor = find_tensor(graph, onnx_node.input(1));
     for (int k = 0; k < onnx_node.attribute_size(); k++)
@@ -1375,16 +1370,16 @@ int load_gather(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
             gather_param->axis = attr.i();
         }
     }
-    int64_t* data = ( int64_t* )indices_tensor->data;
+    int64_t* data = (int64_t*)indices_tensor->data;
     gather_param->indices_num = *data;
     gather_param->is_onnx = 1;
-    
+
     return 0;
 }
 
 int load_comparison(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct comparison_param* comparison_param = ( struct comparison_param* )node->op.param_mem;
+    struct comparison_param* comparison_param = (struct comparison_param*)node->op.param_mem;
     const std::string& op_name = onnx_node.op_type();
 
     if (op_name == "Greater")
@@ -1405,13 +1400,13 @@ int load_comparison(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& o
 
 int load_LRN(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct lrn_param* lrn_param = ( struct lrn_param* )node->op.param_mem;
+    struct lrn_param* lrn_param = (struct lrn_param*)node->op.param_mem;
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
         if (attr.name() == "alpha")
         {
-            lrn_param->alpha = attr.f();    // TODO:Support multi axis
+            lrn_param->alpha = attr.f(); // TODO:Support multi axis
         }
         if (attr.name() == "beta")
         {
@@ -1426,13 +1421,13 @@ int load_LRN(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod
             lrn_param->local_size = attr.i();
         }
     }
-    
+
     return 0;
 }
 
 int load_unary(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct unary_param* unary_param = ( struct unary_param* )node->op.param_mem;
+    struct unary_param* unary_param = (struct unary_param*)node->op.param_mem;
     const std::string& op_name = onnx_node.op_type();
 
     if (op_name == "Abs")
@@ -1467,13 +1462,13 @@ int load_unary(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_n
     {
         unary_param->type = 14;
     }
-    
+
     return 0;
 }
 
 int load_logical(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct logical_param* logical_param = ( struct logical_param* )node->op.param_mem;
+    struct logical_param* logical_param = (struct logical_param*)node->op.param_mem;
     const std::string& op_name = onnx_node.op_type();
 
     if (op_name == "And")
@@ -1484,19 +1479,19 @@ int load_logical(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx
     {
         logical_param->type = 1;
     }
-    
+
     return 0;
 }
 
 int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct pad_param* pad_param = ( struct pad_param* )node->op.param_mem;
-    
-    if (onnx_node.attribute_size() == 1)  // since opset 11, 'pads' and 'value' have been moved from attributes to inputs
+    struct pad_param* pad_param = (struct pad_param*)node->op.param_mem;
+
+    if (onnx_node.attribute_size() == 1) // since opset 11, 'pads' and 'value' have been moved from attributes to inputs
     {
         const std::string& input_name_pad = onnx_node.input(1);
         ir_tensor_t* tensor_pad = find_tensor(graph, input_name_pad);
-        int64_t* data_pad = ( int64_t * )tensor_pad->data;
+        int64_t* data_pad = (int64_t*)tensor_pad->data;
         pad_param->pad_0_h = data_pad[0];
         pad_param->pad_0_w = data_pad[4];
         pad_param->pad_1_h = data_pad[1];
@@ -1510,12 +1505,11 @@ int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod
         {
             const std::string& input_name_value = onnx_node.input(2);
             ir_tensor_t* tensor_value = find_tensor(graph, input_name_value);
-            float* data_value = ( float * )tensor_value->data;
+            float* data_value = (float*)tensor_value->data;
             pad_param->value = data_value[0];
         }
-
     }
-    
+
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
@@ -1550,20 +1544,21 @@ int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod
             pad_param->value = attr.f();
         }
     }
-    if(onnx_node.input_size() > 1){
+    if (onnx_node.input_size() > 1)
+    {
         ir_tensor_t* shape_tensor = find_tensor(graph, onnx_node.input(1));
         int size = shape_tensor->dims[0];
-        int64_t* data = ( int64_t* )shape_tensor->data;
+        int64_t* data = (int64_t*)shape_tensor->data;
         for (int i = 0; i < size; i++)
         {
-                pad_param->pad_0_h = data[0];
-                pad_param->pad_0_w = data[4];
-                pad_param->pad_1_h = data[1];
-                pad_param->pad_1_w = data[5];
-                pad_param->pad_2_h = data[2];
-                pad_param->pad_2_w = data[6];
-                pad_param->pad_3_h = data[3];
-                pad_param->pad_3_w = data[7];
+            pad_param->pad_0_h = data[0];
+            pad_param->pad_0_w = data[4];
+            pad_param->pad_1_h = data[1];
+            pad_param->pad_1_w = data[5];
+            pad_param->pad_2_h = data[2];
+            pad_param->pad_2_w = data[6];
+            pad_param->pad_3_h = data[3];
+            pad_param->pad_3_w = data[7];
         }
     }
     return 0;
@@ -1571,7 +1566,7 @@ int load_pad(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_nod
 
 int load_reduce(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct reduction_param* reduction_param = ( struct reduction_param* )node->op.param_mem;
+    struct reduction_param* reduction_param = (struct reduction_param*)node->op.param_mem;
     const std::string& op_name = onnx_node.op_type();
 
     if (op_name == "ReduceSum")
@@ -1612,7 +1607,7 @@ int load_reduce(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
     reduction_param->dim_2 = -2;
     reduction_param->dim_3 = -2;
     reduction_param->keepdim = 1;
-    
+
     ir_tensor_t* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
     int input_dim_num = input_tensor->dim_num;
     int size = onnx_node.attribute_size();
@@ -1705,8 +1700,8 @@ int load_reduce(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
 
 int load_argmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct argmax_param* argmax_param = ( struct argmax_param* )node->op.param_mem;
-    
+    struct argmax_param* argmax_param = (struct argmax_param*)node->op.param_mem;
+
     int size = onnx_node.attribute_size();
     argmax_param->axis = 0;
     for (int i = 0; i < size; i++)
@@ -1721,14 +1716,14 @@ int load_argmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
             argmax_param->keepdims = attr.i();
         }
     }
-    
+
     return 0;
 }
 
 int load_argmin(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct argmin_param* argmin_param = ( struct argmin_param* )node->op.param_mem;
-    
+    struct argmin_param* argmin_param = (struct argmin_param*)node->op.param_mem;
+
     int size = onnx_node.attribute_size();
     argmin_param->axis = 0;
     for (int i = 0; i < size; i++)
@@ -1743,14 +1738,14 @@ int load_argmin(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
             argmin_param->keepdims = attr.i();
         }
     }
-    
+
     return 0;
 }
 
 int load_log_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct logsoftmax_param* logsoftmax_param = ( struct logsoftmax_param* )node->op.param_mem;
-    
+    struct logsoftmax_param* logsoftmax_param = (struct logsoftmax_param*)node->op.param_mem;
+
     int size = onnx_node.attribute_size();
     logsoftmax_param->axis = 1;
     for (int i = 0; i < size; i++)
@@ -1761,14 +1756,14 @@ int load_log_softmax(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto&
             logsoftmax_param->axis = attr.i();
         }
     }
-    
+
     return 0;
 }
 
 int load_deconv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct deconv_param* deconv_param = ( struct deconv_param* )node->op.param_mem;
-    
+    struct deconv_param* deconv_param = (struct deconv_param*)node->op.param_mem;
+
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
@@ -1812,7 +1807,7 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
     {
         const std::string& input_name = onnx_node.input(k);
         ir_tensor_t* tensor = find_tensor(graph, input_name);
-        if (k == 1)    // weight
+        if (k == 1) // weight
         {
             int* dim = tensor->dims;
             /* onnx hide the output channel in weight ..*/
@@ -1821,14 +1816,14 @@ int load_deconv(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
             deconv_param->kernel_w = dim[3];
         }
     }
-    
+
     return 0;
 }
 
 int load_scatter(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct scatter_param* scatter_param = ( struct scatter_param* )node->op.param_mem;
-    
+    struct scatter_param* scatter_param = (struct scatter_param*)node->op.param_mem;
+
     int size = onnx_node.attribute_size();
     scatter_param->axis = 0;
     scatter_param->is_onnx = 1;
@@ -1840,14 +1835,14 @@ int load_scatter(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx
             scatter_param->axis = attr.i();
         }
     }
-    
+
     return 0;
 }
 
 int load_selu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct selu_param* selu_param = ( struct selu_param* )node->op.param_mem;
-    
+    struct selu_param* selu_param = (struct selu_param*)node->op.param_mem;
+
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
@@ -1860,14 +1855,14 @@ int load_selu(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
             selu_param->lambda = attr.f();
         }
     }
-    
+
     return 0;
 }
 
 int load_hard_sigmoid(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct hard_sigmoid_param* hard_sigmoid_param = ( struct hard_sigmoid_param* )node->op.param_mem;
-    
+    struct hard_sigmoid_param* hard_sigmoid_param = (struct hard_sigmoid_param*)node->op.param_mem;
+
     for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
@@ -1880,26 +1875,26 @@ int load_hard_sigmoid(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto&
             hard_sigmoid_param->beta = attr.f();
         }
     }
-    
+
     return 0;
 }
 
 int load_tile(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct tile_param* tile_param = ( struct tile_param* )node->op.param_mem;
+    struct tile_param* tile_param = (struct tile_param*)node->op.param_mem;
     tile_param->frame_flag = 1;
-    
+
     return 0;
 }
 
 int load_cast(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct cast_param* cast_param = ( struct cast_param* )node->op.param_mem;
+    struct cast_param* cast_param = (struct cast_param*)node->op.param_mem;
 
-    for(int k = 0; k < onnx_node.attribute_size(); k++)
+    for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
-        if(attr.name() == "to")
+        if (attr.name() == "to")
             cast_param->type_to = attr.i();
     }
     cast_param->type_from = 1;
@@ -1909,11 +1904,13 @@ int load_cast(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
 int load_depth_to_space(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct depthtospace_param* depthtospace_param = ( struct depthtospace_param* )node->op.param_mem;
+    struct depthtospace_param* depthtospace_param = (struct depthtospace_param*)node->op.param_mem;
 
-    for(int k = 0; k < onnx_node.attribute_size(); k++){
+    for (int k = 0; k < onnx_node.attribute_size(); k++)
+    {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
-        if(attr.name() == "block_size"){
+        if (attr.name() == "block_size")
+        {
             depthtospace_param->block_size = attr.i();
         }
     }
@@ -1923,12 +1920,12 @@ int load_depth_to_space(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProt
 
 int load_instance_norm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct instancenorm_Param* instancenorm_param = ( struct instancenorm_Param* )node->op.param_mem;
+    struct instancenorm_Param* instancenorm_param = (struct instancenorm_Param*)node->op.param_mem;
 
-    for(int k = 0; k < onnx_node.attribute_size(); k++)
+    for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
-        if(attr.name() == "epsilon")
+        if (attr.name() == "epsilon")
             instancenorm_param->eps = attr.f();
     }
 
@@ -1937,34 +1934,34 @@ int load_instance_norm(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto
 
 int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct interp_param* interp_param = ( struct interp_param* )node->op.param_mem;
+    struct interp_param* interp_param = (struct interp_param*)node->op.param_mem;
 
-    if(onnx_node.input_size() == 1)
+    if (onnx_node.input_size() == 1)
     {
-        for(int k = 0; k < onnx_node.attribute_size(); k++)
+        for (int k = 0; k < onnx_node.attribute_size(); k++)
         {
             const onnx::AttributeProto& attr = onnx_node.attribute(k);
-            if(attr.name() == "scales")
+            if (attr.name() == "scales")
             {
                 interp_param->height_scale = attr.f();
                 interp_param->width_scale = attr.f();
             }
         }
     }
-    else if(onnx_node.input_size() == 2) // opset 10
+    else if (onnx_node.input_size() == 2) // opset 10
     {
         const std::string& input_name = onnx_node.input(1);
         ir_tensor_t* tensor = find_tensor(graph, input_name);
-        float* data = ( float* )tensor->data;
+        float* data = (float*)tensor->data;
 
         interp_param->height_scale = data[2];
         interp_param->width_scale = data[3];
     }
-    else if(onnx_node.input_size() == 3) // opset 11
+    else if (onnx_node.input_size() == 3) // opset 11
     {
         const std::string& input_name = onnx_node.input(2);
         ir_tensor_t* tensor = find_tensor(graph, input_name);
-        float* data = ( float* )tensor->data;
+        float* data = (float*)tensor->data;
 
         interp_param->height_scale = data[2];
         interp_param->width_scale = data[3];
@@ -1973,7 +1970,7 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
     {
         const std::string& input_name = onnx_node.input(3);
         ir_tensor_t* tensor = find_tensor(graph, input_name);
-        float* data = ( float* )tensor->data;
+        float* data = (float*)tensor->data;
 
         interp_param->height_scale = data[2];
         interp_param->width_scale = data[3];
@@ -1985,10 +1982,10 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
     }
 
     std::string mode = "nearest";
-    for(int k = 0; k < onnx_node.attribute_size(); k++)
+    for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
-        if(attr.name() == "mode")
+        if (attr.name() == "mode")
             mode = attr.s();
     }
 
@@ -2006,16 +2003,16 @@ int load_resize(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
 
 int load_LSTM(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct lstm_param* lstm_param = ( struct lstm_param* )node->op.param_mem;
+    struct lstm_param* lstm_param = (struct lstm_param*)node->op.param_mem;
 
     int s_size;
     std::string lstm_type;
-    for(int k = 0; k < onnx_node.attribute_size(); k++)
+    for (int k = 0; k < onnx_node.attribute_size(); k++)
     {
         const onnx::AttributeProto& attr = onnx_node.attribute(k);
-        if(attr.name() == "hidden_size")
+        if (attr.name() == "hidden_size")
             s_size = attr.i();
-        if(attr.name() == "direction")
+        if (attr.name() == "direction")
             lstm_type = attr.s();
     }
 
@@ -2028,7 +2025,7 @@ int load_LSTM(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_no
 
 int load_expand(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node)
 {
-    struct expand_param* expand_param = ( struct expand_param* )node->op.param_mem;
+    struct expand_param* expand_param = (struct expand_param*)node->op.param_mem;
 
     ir_tensor_t* shape_tensor = find_tensor(graph, onnx_node.input(1));
     if (shape_tensor == nullptr)
@@ -2047,92 +2044,91 @@ int load_expand(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_
     return 0;
 }
 
-
 /*
 *   OPERAOTR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER START
 */
 void onnx_serializer::register_op_load()
 {
-    op_load_map["Abs"]                   = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["Acos"]                  = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["And"]                   = std::pair<int, op_load_t>(OP_LOGICAL,      load_logical);
-    op_load_map["ArgMax"]                = std::pair<int, op_load_t>(OP_ARGMAX,       load_argmax);
-    op_load_map["ArgMin"]                = std::pair<int, op_load_t>(OP_ARGMIN,       load_argmin);
-    op_load_map["Asin"]                  = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["Atan"]                  = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["AveragePool"]           = std::pair<int, op_load_t>(OP_POOL,         load_pool);
-    op_load_map["Add"]                   = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["BatchNormalization"]    = std::pair<int, op_load_t>(OP_BATCHNORM,    load_bn);
-    op_load_map["Conv"]                  = std::pair<int, op_load_t>(OP_CONV,         load_conv);
-    op_load_map["ConvTranspose"]         = std::pair<int, op_load_t>(OP_DECONV,       load_deconv);
-    op_load_map["Concat"]                = std::pair<int, op_load_t>(OP_CONCAT,       load_concat);
-    op_load_map["Clip"]                  = std::pair<int, op_load_t>(OP_CLIP,         load_clip);
-    op_load_map["Ceil"]                  = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["Cos"]                   = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["Cast"]                  = std::pair<int, op_load_t>(OP_CAST,         load_cast);
-    op_load_map["Dropout"]               = std::pair<int, op_load_t>(OP_DROPOUT,      load_no_param);
-    op_load_map["DepthToSpace"]          = std::pair<int, op_load_t>(OP_DEPTHTOSPACE, load_depth_to_space);
-    op_load_map["Div"]                   = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["Elu"]                   = std::pair<int, op_load_t>(OP_ELU,          load_elu);
-    op_load_map["Exp"]                   = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["Expand"]                = std::pair<int, op_load_t>(OP_EXPAND,       load_expand);
-    op_load_map["Equal"]                 = std::pair<int, op_load_t>(OP_COMPARISON,   load_comparison);
-    op_load_map["Flatten"]               = std::pair<int, op_load_t>(OP_FLATTEN,      load_flatten);
-    op_load_map["Floor"]                 = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["Gemm"]                  = std::pair<int, op_load_t>(OP_GEMM,         load_gemm);
-    op_load_map["Gather"]                = std::pair<int, op_load_t>(OP_GATHER,       load_gather);
-    op_load_map["Greater"]               = std::pair<int, op_load_t>(OP_COMPARISON,   load_comparison);
-    op_load_map["GlobalAveragePool"]     = std::pair<int, op_load_t>(OP_POOL,         load_pool);
-    op_load_map["HardSwish"]             = std::pair<int, op_load_t>(OP_HARDSWISH,    load_no_param);
-    op_load_map["HardSigmoid"]           = std::pair<int, op_load_t>(OP_HARDSIGMOID,  load_hard_sigmoid);
+    op_load_map["Abs"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["Acos"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["And"] = std::pair<int, op_load_t>(OP_LOGICAL, load_logical);
+    op_load_map["ArgMax"] = std::pair<int, op_load_t>(OP_ARGMAX, load_argmax);
+    op_load_map["ArgMin"] = std::pair<int, op_load_t>(OP_ARGMIN, load_argmin);
+    op_load_map["Asin"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["Atan"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["AveragePool"] = std::pair<int, op_load_t>(OP_POOL, load_pool);
+    op_load_map["Add"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["BatchNormalization"] = std::pair<int, op_load_t>(OP_BATCHNORM, load_bn);
+    op_load_map["Conv"] = std::pair<int, op_load_t>(OP_CONV, load_conv);
+    op_load_map["ConvTranspose"] = std::pair<int, op_load_t>(OP_DECONV, load_deconv);
+    op_load_map["Concat"] = std::pair<int, op_load_t>(OP_CONCAT, load_concat);
+    op_load_map["Clip"] = std::pair<int, op_load_t>(OP_CLIP, load_clip);
+    op_load_map["Ceil"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["Cos"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["Cast"] = std::pair<int, op_load_t>(OP_CAST, load_cast);
+    op_load_map["Dropout"] = std::pair<int, op_load_t>(OP_DROPOUT, load_no_param);
+    op_load_map["DepthToSpace"] = std::pair<int, op_load_t>(OP_DEPTHTOSPACE, load_depth_to_space);
+    op_load_map["Div"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Elu"] = std::pair<int, op_load_t>(OP_ELU, load_elu);
+    op_load_map["Exp"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Expand"] = std::pair<int, op_load_t>(OP_EXPAND, load_expand);
+    op_load_map["Equal"] = std::pair<int, op_load_t>(OP_COMPARISON, load_comparison);
+    op_load_map["Flatten"] = std::pair<int, op_load_t>(OP_FLATTEN, load_flatten);
+    op_load_map["Floor"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Gemm"] = std::pair<int, op_load_t>(OP_GEMM, load_gemm);
+    op_load_map["Gather"] = std::pair<int, op_load_t>(OP_GATHER, load_gather);
+    op_load_map["Greater"] = std::pair<int, op_load_t>(OP_COMPARISON, load_comparison);
+    op_load_map["GlobalAveragePool"] = std::pair<int, op_load_t>(OP_POOL, load_pool);
+    op_load_map["HardSwish"] = std::pair<int, op_load_t>(OP_HARDSWISH, load_no_param);
+    op_load_map["HardSigmoid"] = std::pair<int, op_load_t>(OP_HARDSIGMOID, load_hard_sigmoid);
     op_load_map["InstanceNormalization"] = std::pair<int, op_load_t>(OP_INSTANCENORM, load_instance_norm);
-    op_load_map["Log"]                   = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["LRN"]                   = std::pair<int, op_load_t>(OP_LRN,          load_LRN);
-    op_load_map["Less"]                  = std::pair<int, op_load_t>(OP_COMPARISON,   load_comparison);
-    op_load_map["LSTM"]                  = std::pair<int, op_load_t>(OP_LSTM,         load_LSTM);
-    op_load_map["LeakyRelu"]             = std::pair<int, op_load_t>(OP_RELU,         load_leaky_relu);
-    op_load_map["LogSoftmax"]            = std::pair<int, op_load_t>(OP_LOGSOFTMAX,   load_log_softmax);
-    op_load_map["Mul"]                   = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["Max"]                   = std::pair<int, op_load_t>(OP_MAXIMUM,      load_no_param);
-    op_load_map["Min"]                   = std::pair<int, op_load_t>(OP_MINIMUM,      load_no_param);
-    op_load_map["Mean"]                  = std::pair<int, op_load_t>(OP_MEAN,         load_no_param);
-    op_load_map["MatMul"]                = std::pair<int, op_load_t>(OP_MATMUL,       load_matmul);
-    op_load_map["MaxPool"]               = std::pair<int, op_load_t>(OP_POOL,         load_pool);
-    op_load_map["Neg"]                   = std::pair<int, op_load_t>(OP_UNARY,        load_unary);
-    op_load_map["Or"]                    = std::pair<int, op_load_t>(OP_LOGICAL,      load_logical);
-    op_load_map["Pad"]                   = std::pair<int, op_load_t>(OP_PAD,          load_pad);
-    op_load_map["Pow"]                   = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["PRelu"]                 = std::pair<int, op_load_t>(OP_PRELU,        load_no_param);
-    op_load_map["Relu"]                  = std::pair<int, op_load_t>(OP_RELU,         load_relu);
-    op_load_map["Resize"]                = std::pair<int, op_load_t>(OP_INTERP,       load_resize);
-    op_load_map["Reshape"]               = std::pair<int, op_load_t>(OP_RESHAPE,      load_reshape);
-    op_load_map["ReduceL2"]              = std::pair<int, op_load_t>(OP_REDUCEL2,     load_reducel2);
-    op_load_map["ReduceMean"]            = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["ReduceLogSumExp"]       = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["ReduceLogSum"]          = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["ReduceMax"]             = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["ReduceMin"]             = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["ReduceProd"]            = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["ReduceSumSquare"]       = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["ReduceSum"]             = std::pair<int, op_load_t>(OP_REDUCTION,    load_reduce);
-    op_load_map["Reciprocal"]            = std::pair<int, op_load_t>(OP_RECIPROCAL,   load_no_param);
-    op_load_map["Sub"]                   = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["Selu"]                  = std::pair<int, op_load_t>(OP_SELU,         load_selu);
-    op_load_map["Sqrt"]                  = std::pair<int, op_load_t>(OP_ELTWISE,      load_eltwise);
-    op_load_map["Slice"]                 = std::pair<int, op_load_t>(OP_SLICE,        load_slice);
-    op_load_map["Split"]                 = std::pair<int, op_load_t>(OP_SPLIT,        load_split);
-    op_load_map["Shape"]                 = std::pair<int, op_load_t>(OP_SHAPE,        load_no_param);
-    op_load_map["Squeeze"]               = std::pair<int, op_load_t>(OP_SQUEEZE,      load_squeeze);
-    op_load_map["Scatter"]               = std::pair<int, op_load_t>(OP_SCATTER,      load_scatter);
-    op_load_map["Sigmoid"]               = std::pair<int, op_load_t>(OP_SIGMOID,      load_no_param);
-    op_load_map["Softmax"]               = std::pair<int, op_load_t>(OP_SOFTMAX,      load_softmax);
-    op_load_map["Softplus"]              = std::pair<int, op_load_t>(OP_SOFTPLUS,     load_no_param);
-    op_load_map["Tanh"]                  = std::pair<int, op_load_t>(OP_TANH,         load_no_param);
-    op_load_map["Tile"]                  = std::pair<int, op_load_t>(OP_TILE,         load_tile);
-    op_load_map["Transpose"]             = std::pair<int, op_load_t>(OP_TRANSPOSE,    load_transpose);
-    op_load_map["Upsample"]              = std::pair<int, op_load_t>(OP_INTERP,       load_interp);
-    op_load_map["Unsqueeze"]             = std::pair<int, op_load_t>(OP_UNSQUEEZE,    load_unsqueeze);
-    op_load_map["Where"]                 = std::pair<int, op_load_t>(OP_WHERE,        load_no_param);
+    op_load_map["Log"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["LRN"] = std::pair<int, op_load_t>(OP_LRN, load_LRN);
+    op_load_map["Less"] = std::pair<int, op_load_t>(OP_COMPARISON, load_comparison);
+    op_load_map["LSTM"] = std::pair<int, op_load_t>(OP_LSTM, load_LSTM);
+    op_load_map["LeakyRelu"] = std::pair<int, op_load_t>(OP_RELU, load_leaky_relu);
+    op_load_map["LogSoftmax"] = std::pair<int, op_load_t>(OP_LOGSOFTMAX, load_log_softmax);
+    op_load_map["Mul"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Max"] = std::pair<int, op_load_t>(OP_MAXIMUM, load_no_param);
+    op_load_map["Min"] = std::pair<int, op_load_t>(OP_MINIMUM, load_no_param);
+    op_load_map["Mean"] = std::pair<int, op_load_t>(OP_MEAN, load_no_param);
+    op_load_map["MatMul"] = std::pair<int, op_load_t>(OP_MATMUL, load_matmul);
+    op_load_map["MaxPool"] = std::pair<int, op_load_t>(OP_POOL, load_pool);
+    op_load_map["Neg"] = std::pair<int, op_load_t>(OP_UNARY, load_unary);
+    op_load_map["Or"] = std::pair<int, op_load_t>(OP_LOGICAL, load_logical);
+    op_load_map["Pad"] = std::pair<int, op_load_t>(OP_PAD, load_pad);
+    op_load_map["Pow"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["PRelu"] = std::pair<int, op_load_t>(OP_PRELU, load_no_param);
+    op_load_map["Relu"] = std::pair<int, op_load_t>(OP_RELU, load_relu);
+    op_load_map["Resize"] = std::pair<int, op_load_t>(OP_INTERP, load_resize);
+    op_load_map["Reshape"] = std::pair<int, op_load_t>(OP_RESHAPE, load_reshape);
+    op_load_map["ReduceL2"] = std::pair<int, op_load_t>(OP_REDUCEL2, load_reducel2);
+    op_load_map["ReduceMean"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["ReduceLogSumExp"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["ReduceLogSum"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["ReduceMax"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["ReduceMin"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["ReduceProd"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["ReduceSumSquare"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["ReduceSum"] = std::pair<int, op_load_t>(OP_REDUCTION, load_reduce);
+    op_load_map["Reciprocal"] = std::pair<int, op_load_t>(OP_RECIPROCAL, load_no_param);
+    op_load_map["Sub"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Selu"] = std::pair<int, op_load_t>(OP_SELU, load_selu);
+    op_load_map["Sqrt"] = std::pair<int, op_load_t>(OP_ELTWISE, load_eltwise);
+    op_load_map["Slice"] = std::pair<int, op_load_t>(OP_SLICE, load_slice);
+    op_load_map["Split"] = std::pair<int, op_load_t>(OP_SPLIT, load_split);
+    op_load_map["Shape"] = std::pair<int, op_load_t>(OP_SHAPE, load_no_param);
+    op_load_map["Squeeze"] = std::pair<int, op_load_t>(OP_SQUEEZE, load_squeeze);
+    op_load_map["Scatter"] = std::pair<int, op_load_t>(OP_SCATTER, load_scatter);
+    op_load_map["Sigmoid"] = std::pair<int, op_load_t>(OP_SIGMOID, load_no_param);
+    op_load_map["Softmax"] = std::pair<int, op_load_t>(OP_SOFTMAX, load_softmax);
+    op_load_map["Softplus"] = std::pair<int, op_load_t>(OP_SOFTPLUS, load_no_param);
+    op_load_map["Tanh"] = std::pair<int, op_load_t>(OP_TANH, load_no_param);
+    op_load_map["Tile"] = std::pair<int, op_load_t>(OP_TILE, load_tile);
+    op_load_map["Transpose"] = std::pair<int, op_load_t>(OP_TRANSPOSE, load_transpose);
+    op_load_map["Upsample"] = std::pair<int, op_load_t>(OP_INTERP, load_interp);
+    op_load_map["Unsqueeze"] = std::pair<int, op_load_t>(OP_UNSQUEEZE, load_unsqueeze);
+    op_load_map["Where"] = std::pair<int, op_load_t>(OP_WHERE, load_no_param);
 }
 /*
 *   OPERAOTR REGISTER FUNCTION DEFINE FOR ONNX SERIALIZER END
diff --git a/tools/convert_tool/onnx/onnx2tengine.hpp b/tools/convert_tool/onnx/onnx2tengine.hpp
index 17e52ec50..50df1b83a 100644
--- a/tools/convert_tool/onnx/onnx2tengine.hpp
+++ b/tools/convert_tool/onnx/onnx2tengine.hpp
@@ -37,19 +37,18 @@
 #include <google/protobuf/text_format.h>
 #include <google/protobuf/message.h>
 
-extern "C" 
-{
-    #include "tengine/c_api.h"
-    #include "graph/graph.h"
-    #include "graph/subgraph.h"
-    #include "graph/node.h"
-    #include "graph/tensor.h"
-    #include "executer/executer.h"
-    #include "module/module.h"
-    #include "utility/log.h"
-    #include "utility/sys_port.h"
-    #include "utility/vector.h"
-    #include "../utils/save_graph/op_include.h"
+extern "C" {
+#include "tengine/c_api.h"
+#include "graph/graph.h"
+#include "graph/subgraph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "executer/executer.h"
+#include "module/module.h"
+#include "utility/log.h"
+#include "utility/sys_port.h"
+#include "utility/vector.h"
+#include "../utils/save_graph/op_include.h"
 }
 
 class onnx_serializer
@@ -59,19 +58,17 @@ class onnx_serializer
     typedef int (*op_load_t)(ir_graph_t* graph, ir_node_t* node, const onnx::NodeProto& onnx_node);
 
 private:
-    std::unordered_map<std::string, std::pair<int, op_load_t>> op_load_map;
+    std::unordered_map<std::string, std::pair<int, op_load_t> > op_load_map;
     int load_model(ir_graph_t* graph, std::string model_file);
     int set_graph_output(ir_graph_t* graph, const onnx::GraphProto& onnx_graph);
     int load_graph_node(ir_graph_t* graph, const onnx::GraphProto& onnx_graph);
     int set_graph_input(ir_graph_t* graph, const onnx::GraphProto& onnx_graph);
     int load_initializer_tensor(ir_graph_t* graph, const onnx::GraphProto& onnx_graph);
     int load_constant_tensor(ir_graph_t* graph, const onnx::GraphProto& onnx_graph);
-    int load_model_file(std::string model_file, onnx::ModelProto &model);
+    int load_model_file(std::string model_file, onnx::ModelProto& model);
     bool find_op_load_method(const std::string& op_name);
     void register_op_load();
     std::unordered_map<std::string, int> tensor_check;
 };
 
-
-
 #endif
\ No newline at end of file
diff --git a/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp b/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp
index d7ba7264d..ef46266f7 100644
--- a/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp
+++ b/tools/convert_tool/utils/graph_optimizer/graph_opt.cpp
@@ -50,7 +50,7 @@ static int erase_tensor_id(ir_graph_t* graph, int16_t id)
             node->output_tensors[j] = old_new_id[node->output_tensors[j]];
         }
     }
-    
+
     ir_tensor_t** new_tensor_list = (ir_tensor_t**)sys_realloc(graph->tensor_list, sizeof(ir_tensor_t*) * (graph->tensor_num - 1));
     graph->tensor_list = new_tensor_list;
     graph->tensor_num--;
@@ -68,7 +68,7 @@ static int erase_node_id(ir_graph_t* graph, int16_t id)
     for (size_t i = 0; i < graph->node_num; i++)
     {
         if (i == id) continue;
-        
+
         ir_node_t* node = get_ir_graph_node(graph, i);
         node->index = j;
         graph->node_list[j] = graph->node_list[i];
@@ -93,7 +93,7 @@ static int erase_node_id(ir_graph_t* graph, int16_t id)
     {
         graph->output_nodes[i] = old_new_id[graph->output_nodes[i]];
     }
-    
+
     ir_node_t** new_node_list = (ir_node_t**)sys_realloc(graph->node_list, sizeof(ir_node_t*) * (graph->node_num - 1));
     graph->node_list = new_node_list;
     graph->node_num--;
@@ -124,7 +124,7 @@ static int delete_node(ir_graph_t* graph, int16_t pre_node_id, int16_t del_node_
         }
     }
     pre_output_tensor->consumer_num = del_output_tensor->consumer_num;
-    
+
     /* delete node */
     if (erase_tensor_id(graph, del_node->output_tensors[0]) < 0 || erase_node_id(graph, del_node->index) < 0)
     {
@@ -173,7 +173,7 @@ static int insert_node_id(ir_graph_t* graph, int16_t insert_node_id, int16_t ins
     {
         graph->output_nodes[i] = old_new_id[graph->output_nodes[i]];
     }
-    
+
     return 0;
 }
 
@@ -226,7 +226,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type,
         if (tensor->tensor_type == TENSOR_TYPE_VAR)
             up_nodes.push_back(tensor->producer);
     }
-    
+
     /* create node and its own tensor */
     ir_node_t* add_node = create_ir_node(graph, name, add_node_type, 1);
     if (add_node == nullptr)
@@ -242,7 +242,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type,
     {
         ir_node_t* up_node = get_ir_graph_node(graph, up_nodes[i]);
         ir_tensor_t* up_node_output_tensor = get_ir_graph_tensor(graph, up_node->output_tensors[0]);
-        for (size_t i = 0; i <up_node_output_tensor->consumer_num; i++)
+        for (size_t i = 0; i < up_node_output_tensor->consumer_num; i++)
         {
             if (up_node_output_tensor->consumer[i] == down_node_id)
                 up_node_output_tensor->consumer[i] = add_node->index;
@@ -252,7 +252,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type,
     down_node->input_tensors[0] = add_tensor->index;
     add_tensor->consumer[0] = down_node_id;
     add_tensor->consumer_num = 1;
-    
+
     /* insert node id */
     if (insert_node_id(graph, add_node->index, down_node_id) < 0)
         return -1;
@@ -265,7 +265,7 @@ static int add_node(ir_graph_t* graph, int16_t down_node_id, int add_node_type,
 }
 
 static int weight_bn(ir_graph_t* graph, ir_node_t* conv_node, float* mean, float* var, float* gamma, float* beta, float eps,
-                      float rescale_factor, ir_tensor_t* bias_tensor)
+                     float rescale_factor, ir_tensor_t* bias_tensor)
 {
     ir_tensor_t* kernel_tensor = get_ir_graph_tensor(graph, conv_node->input_tensors[1]);
     struct conv_param* param = (struct conv_param*)conv_node->op.param_mem;
@@ -278,9 +278,9 @@ static int weight_bn(ir_graph_t* graph, ir_node_t* conv_node, float* mean, float
     int kernel_size = input_chan * kernel_x * kernel_y;
     float* kernel_data = (float*)kernel_tensor->data;
     int channel_num = kernel_tensor->dims[0];
-    
-    float* scale_mean = ( float* )malloc(channel_num * sizeof(float));
-    float* scale_var_inv = ( float* )malloc(channel_num * sizeof(float));
+
+    float* scale_mean = (float*)malloc(channel_num * sizeof(float));
+    float* scale_var_inv = (float*)malloc(channel_num * sizeof(float));
 
     float rescale_factor_tmp = rescale_factor;
     float* bias = NULL;
@@ -313,7 +313,7 @@ static int weight_bn(ir_graph_t* graph, ir_node_t* conv_node, float* mean, float
         insert_node_id(graph, bias_node->index, kernel_tensor->producer);
         insert_tensor_id(graph, bias_tensor->index, kernel_tensor->index);
     }
-    
+
     rescale_factor_tmp = rescale_factor_tmp ? 1 / rescale_factor_tmp : 0;
 
     if (NULL == bias)
@@ -385,9 +385,9 @@ static int fc_weight_bn(ir_graph_t* graph, ir_node_t* fc_node, float* mean, floa
     int channel_num = kernel_tensor->dims[0];
     int total_size = kernel_tensor->dims[1];
     int kernel_size = total_size;
-    
-    float* scale_mean = ( float* )malloc(channel_num * sizeof(float));
-    float* scale_var_inv = ( float* )malloc(channel_num * sizeof(float));
+
+    float* scale_mean = (float*)malloc(channel_num * sizeof(float));
+    float* scale_var_inv = (float*)malloc(channel_num * sizeof(float));
 
     float rescale_factor_tmp = rescale_factor;
     float* bias = NULL;
@@ -420,7 +420,7 @@ static int fc_weight_bn(ir_graph_t* graph, ir_node_t* fc_node, float* mean, floa
         insert_node_id(graph, bias_node->index, kernel_tensor->producer);
         insert_tensor_id(graph, bias_tensor->index, kernel_tensor->index);
     }
-    
+
     rescale_factor_tmp = rescale_factor_tmp ? 1 / rescale_factor_tmp : 0;
 
     if (NULL == bias)
@@ -460,7 +460,7 @@ static int fc_weight_bn(ir_graph_t* graph, ir_node_t* fc_node, float* mean, floa
         float w_scale = scale_var_inv[o_c];
         for (int i = 0; i < kernel_size; i++)
         {
-            kernel_data[o_c * kernel_size + i] = kernel_data[o_c * kernel_size + i] * w_scale ;
+            kernel_data[o_c * kernel_size + i] = kernel_data[o_c * kernel_size + i] * w_scale;
         }
     }
 
@@ -493,7 +493,7 @@ static int change_node_op(ir_node_t* node, int new_op_type)
 static int fuse_conv_relu_common(ir_graph_t* graph)
 {
     /* get all conv-relu chain */
-    std::vector<std::pair<ir_node_t*, ir_node_t*>> conv_relu_v;
+    std::vector<std::pair<ir_node_t*, ir_node_t*> > conv_relu_v;
     for (size_t i = 0; i < graph->node_num; i++)
     {
         ir_node_t* relu_node = get_ir_graph_node(graph, i);
@@ -501,7 +501,7 @@ static int fuse_conv_relu_common(ir_graph_t* graph)
             continue;
         if (relu_node->op.type == OP_RELU)
         {
-            struct relu_param* relu_param =(struct relu_param*)relu_node->op.param_mem;
+            struct relu_param* relu_param = (struct relu_param*)relu_node->op.param_mem;
             if (relu_param->negative_slope != 0.f)
                 continue;
         }
@@ -516,7 +516,7 @@ static int fuse_conv_relu_common(ir_graph_t* graph)
     }
 
     /* fused */
-    for (auto& conv_relu:conv_relu_v)
+    for (auto& conv_relu : conv_relu_v)
     {
         ir_node_t* conv_node = conv_relu.first;
         ir_node_t* relu_node = conv_relu.second;
@@ -525,7 +525,7 @@ static int fuse_conv_relu_common(ir_graph_t* graph)
             conv_param->activation = 0;
         if (relu_node->op.type == OP_RELU6)
             conv_param->activation = 6;
-        
+
         /* delete relu node */
         if (delete_node(graph, conv_node->index, relu_node->index) < 0)
         {
@@ -533,14 +533,14 @@ static int fuse_conv_relu_common(ir_graph_t* graph)
             return -1;
         }
     }
-    
+
     return 0;
 }
 
 static int fuse_relu_eltwise(ir_graph_t* graph)
 {
     /* get all relu-eltwise chain */
-    std::vector<std::pair<ir_node_t*, ir_node_t*>> relu_eltwise_v;
+    std::vector<std::pair<ir_node_t*, ir_node_t*> > relu_eltwise_v;
     for (size_t i = 0; i < graph->node_num; i++)
     {
         ir_node_t* elt_node = get_ir_graph_node(graph, i);
@@ -548,8 +548,8 @@ static int fuse_relu_eltwise(ir_graph_t* graph)
             continue;
         struct eltwise_param* elt_param = (struct eltwise_param*)elt_node->op.param_mem;
         if (elt_param->type != ELT_MIN_SCALAR)
-            continue;       // todo: verify 6
-        
+            continue; // todo: verify 6
+
         /*Check if it is a  relu + minimum*/
         ir_tensor_t* relu_tensor = get_ir_graph_tensor(graph, elt_node->input_tensors[0]);
         ir_node_t* relu_node = get_ir_graph_node(graph, relu_tensor->producer);
@@ -559,12 +559,12 @@ static int fuse_relu_eltwise(ir_graph_t* graph)
     }
 
     /* fused */
-    for (auto& relu_elt:relu_eltwise_v)
+    for (auto& relu_elt : relu_eltwise_v)
     {
         ir_node_t* relu_node = relu_elt.first;
         ir_node_t* elt_node = relu_elt.second;
         relu_node->op.type = OP_RELU6;
-        
+
         /* delete elt node */
         if (delete_node(graph, relu_node->index, elt_node->index) < 0)
         {
@@ -572,20 +572,20 @@ static int fuse_relu_eltwise(ir_graph_t* graph)
             return -1;
         }
     }
-    
+
     return 0;
 }
 
 static int fuse_bn_scale(ir_graph_t* graph)
 {
     /* get all bn-scale chain */
-    std::vector<std::pair<ir_node_t*, ir_node_t*>> bn_scale_v;
+    std::vector<std::pair<ir_node_t*, ir_node_t*> > bn_scale_v;
     for (size_t i = 0; i < graph->node_num; i++)
     {
         ir_node_t* scale_node = get_ir_graph_node(graph, i);
         if (scale_node->op.type != OP_SCALE)
             continue;
-        
+
         /*Check if it is a  bn + scale*/
         ir_tensor_t* bn_tensor = get_ir_graph_tensor(graph, scale_node->input_tensors[0]);
         ir_node_t* bn_node = get_ir_graph_node(graph, bn_tensor->producer);
@@ -595,7 +595,7 @@ static int fuse_bn_scale(ir_graph_t* graph)
     }
 
     /* fused */
-    for (auto& bn_scale:bn_scale_v)
+    for (auto& bn_scale : bn_scale_v)
     {
         ir_node_t* bn_node = bn_scale.first;
         ir_node_t* scale_node = bn_scale.second;
@@ -610,7 +610,7 @@ static int fuse_bn_scale(ir_graph_t* graph)
 
         struct batchnorm_param* param = (struct batchnorm_param*)bn_node->op.param_mem;
         param->caffe_flavor = 0;
-        
+
         /* delete scale node */
         if (delete_node(graph, bn_node->index, scale_node->index) < 0)
         {
@@ -618,20 +618,20 @@ static int fuse_bn_scale(ir_graph_t* graph)
             return -1;
         }
     }
-    
+
     return 0;
 }
 
 static int fuse_conv_bn(ir_graph_t* graph)
 {
     /* get all conv-bn chain */
-    std::vector<std::pair<ir_node_t*, ir_node_t*>> conv_bn_v;
+    std::vector<std::pair<ir_node_t*, ir_node_t*> > conv_bn_v;
     for (size_t i = 0; i < graph->node_num; i++)
     {
         ir_node_t* bn_node = get_ir_graph_node(graph, i);
         if (bn_node->op.type != OP_BATCHNORM)
             continue;
-        
+
         /*Check if it is a  conv + bn*/
         ir_tensor_t* conv_tensor = get_ir_graph_tensor(graph, bn_node->input_tensors[0]);
         ir_node_t* conv_node = get_ir_graph_node(graph, conv_tensor->producer);
@@ -641,23 +641,23 @@ static int fuse_conv_bn(ir_graph_t* graph)
     }
 
     /* fused */
-    for (auto& conv_bn:conv_bn_v)
+    for (auto& conv_bn : conv_bn_v)
     {
         ir_node_t* conv_node = conv_bn.first;
         ir_node_t* bn_node = conv_bn.second;
         struct batchnorm_param* bn_param = (struct batchnorm_param*)bn_node->op.param_mem;
         ir_tensor_t* bn_mean = get_ir_graph_tensor(graph, bn_node->input_tensors[3]);
-        ir_tensor_t* bn_var  = get_ir_graph_tensor(graph, bn_node->input_tensors[4]);
+        ir_tensor_t* bn_var = get_ir_graph_tensor(graph, bn_node->input_tensors[4]);
 
         float* mean = (float*)bn_mean->data;
         float* var = (float*)bn_var->data;
         float* gamma = NULL;
         float* beta = NULL;
 
-        if(!bn_param->caffe_flavor)
+        if (!bn_param->caffe_flavor)
         {
             ir_tensor_t* bn_gamma = get_ir_graph_tensor(graph, bn_node->input_tensors[1]);
-            ir_tensor_t* bn_beta  = get_ir_graph_tensor(graph, bn_node->input_tensors[2]);
+            ir_tensor_t* bn_beta = get_ir_graph_tensor(graph, bn_node->input_tensors[2]);
             gamma = (float*)bn_gamma->data;
             beta = (float*)bn_beta->data;
         }
@@ -665,9 +665,9 @@ static int fuse_conv_bn(ir_graph_t* graph)
         ir_tensor_t* bias_tensor = nullptr;
         if (conv_node->input_num > 2)
             bias_tensor = get_ir_graph_tensor(graph, conv_node->input_tensors[2]);
-        
+
         weight_bn(graph, conv_node, mean, var, gamma, beta, bn_param->eps, bn_param->rescale_factor, bias_tensor);
-        
+
         /* delete elt node */
         if (delete_node(graph, conv_node->index, bn_node->index) < 0)
         {
@@ -675,20 +675,20 @@ static int fuse_conv_bn(ir_graph_t* graph)
             return -1;
         }
     }
-    
+
     return 0;
 }
 
 static int fuse_fc_bn(ir_graph_t* graph)
 {
     /* get all fc-bn chain */
-    std::vector<std::pair<ir_node_t*, ir_node_t*>> fc_bn_v;
+    std::vector<std::pair<ir_node_t*, ir_node_t*> > fc_bn_v;
     for (size_t i = 0; i < graph->node_num; i++)
     {
         ir_node_t* bn_node = get_ir_graph_node(graph, i);
         if (bn_node->op.type != OP_BATCHNORM)
             continue;
-        
+
         /*Check if it is a  fc + bn*/
         ir_tensor_t* fc_tensor = get_ir_graph_tensor(graph, bn_node->input_tensors[0]);
         ir_node_t* fc_node = get_ir_graph_node(graph, fc_tensor->producer);
@@ -698,23 +698,23 @@ static int fuse_fc_bn(ir_graph_t* graph)
     }
 
     /* fused */
-    for (auto& fc_bn:fc_bn_v)
+    for (auto& fc_bn : fc_bn_v)
     {
         ir_node_t* fc_node = fc_bn.first;
         ir_node_t* bn_node = fc_bn.second;
         struct batchnorm_param* bn_param = (struct batchnorm_param*)bn_node->op.param_mem;
         ir_tensor_t* bn_mean = get_ir_graph_tensor(graph, bn_node->input_tensors[3]);
-        ir_tensor_t* bn_var  = get_ir_graph_tensor(graph, bn_node->input_tensors[4]);
+        ir_tensor_t* bn_var = get_ir_graph_tensor(graph, bn_node->input_tensors[4]);
 
         float* mean = (float*)bn_mean->data;
         float* var = (float*)bn_var->data;
         float* gamma = NULL;
         float* beta = NULL;
 
-        if(!bn_param->caffe_flavor)
+        if (!bn_param->caffe_flavor)
         {
             ir_tensor_t* bn_gamma = get_ir_graph_tensor(graph, bn_node->input_tensors[1]);
-            ir_tensor_t* bn_beta  = get_ir_graph_tensor(graph, bn_node->input_tensors[2]);
+            ir_tensor_t* bn_beta = get_ir_graph_tensor(graph, bn_node->input_tensors[2]);
             gamma = (float*)bn_gamma->data;
             beta = (float*)bn_beta->data;
         }
@@ -722,7 +722,7 @@ static int fuse_fc_bn(ir_graph_t* graph)
         ir_tensor_t* bias_tensor = nullptr;
         if (fc_node->input_num > 2)
             bias_tensor = get_ir_graph_tensor(graph, fc_node->input_tensors[2]);
-        
+
         fc_weight_bn(graph, fc_node, mean, var, gamma, beta, bn_param->eps, bn_param->rescale_factor, bias_tensor);
 
         /* delete bn node */
@@ -732,14 +732,14 @@ static int fuse_fc_bn(ir_graph_t* graph)
             return -1;
         }
     }
-    
+
     return 0;
 }
 
 static int fuse_conv_unsqueeze(ir_graph_t* graph)
 {
     /* get all unsqueeze conv|fc eltwise chain */
-    std::vector<std::vector<ir_node_t*>> fused_nodes;
+    std::vector<std::vector<ir_node_t*> > fused_nodes;
     for (size_t i = 0; i < graph->node_num; i++)
     {
         ir_node_t* elt_node = get_ir_graph_node(graph, i);
@@ -748,7 +748,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph)
         struct eltwise_param* param = (struct eltwise_param*)elt_node->op.param_mem;
         if (elt_node->input_num != 2 || param->type != ELT_SUM) // unsqueeze and conv|fc
             continue;
-        
+
         /* Check if it is a  (unsqueeze conv|fc) + eltwise */
         ir_tensor_t* conv_tensor = get_ir_graph_tensor(graph, elt_node->input_tensors[0]);
         ir_tensor_t* unsq_tensor = get_ir_graph_tensor(graph, elt_node->input_tensors[1]);
@@ -766,7 +766,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph)
         ir_node_t* conv_or_fc_node = fused_nodes[i][0];
         ir_node_t* unsq_node = fused_nodes[i][1];
         ir_node_t* elt_node = fused_nodes[i][2];
-        
+
         ir_tensor_t* bias_tensor = get_ir_graph_tensor(graph, unsq_node->input_tensors[0]);
         set_ir_node_input_tensor(conv_or_fc_node, conv_or_fc_node->input_num, bias_tensor);
         bias_tensor->consumer[0] = conv_or_fc_node->index;
@@ -778,7 +778,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph)
             fprintf(stderr, "delete node:%s failed.\n", unsq_node->name);
             return -1;
         }
-        
+
         /* delete elt node */
         if (delete_node(graph, conv_or_fc_node->index, elt_node->index) < 0)
         {
@@ -786,7 +786,7 @@ static int fuse_conv_unsqueeze(ir_graph_t* graph)
             return -1;
         }
     }
-    
+
     return 0;
 }
 
diff --git a/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp b/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp
index c867f853b..8d5c67e8d 100644
--- a/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp
+++ b/tools/convert_tool/utils/graph_optimizer/graph_opt.hpp
@@ -7,21 +7,20 @@
 #include "string.h"
 #include <string>
 #include "math.h"
-extern "C" 
-{
-    #include "tengine/c_api.h"
-    #include "graph/graph.h"
-    #include "graph/node.h"
-    #include "graph/tensor.h"
-    #include "module/module.h"
-    #include "utility/log.h"
-    #include "utility/sys_port.h"
-    
-    #include "convolution_param.h"
-    #include "relu_param.h"
-    #include "eltwise_param.h"
-    #include "batchnorm_param.h"
-    #include "fc_param.h"
+extern "C" {
+#include "tengine/c_api.h"
+#include "graph/graph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "module/module.h"
+#include "utility/log.h"
+#include "utility/sys_port.h"
+
+#include "convolution_param.h"
+#include "relu_param.h"
+#include "eltwise_param.h"
+#include "batchnorm_param.h"
+#include "fc_param.h"
 }
 
 int graph_opt(graph_t graph);
diff --git a/tools/convert_tool/utils/save_graph/save_graph.cpp b/tools/convert_tool/utils/save_graph/save_graph.cpp
index 2a3ba3346..71a5efb8b 100644
--- a/tools/convert_tool/utils/save_graph/save_graph.cpp
+++ b/tools/convert_tool/utils/save_graph/save_graph.cpp
@@ -35,7 +35,7 @@ bool IsSaveString(void)
 {
     const char* env = std::getenv("TM_NO_STRING");
 
-    if(env)
+    if (env)
         return false;
     else
         return true;
@@ -45,7 +45,7 @@ bool IsSaveData(void)
 {
     const char* env = std::getenv("TM_FOR_BENCHMARK");
 
-    if(env)
+    if (env)
         return false;
     else
         return true;
@@ -53,7 +53,7 @@ bool IsSaveData(void)
 
 bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func)
 {
-    if(op_save_map_.count(op_type))
+    if (op_save_map_.count(op_type))
         return false;
 
     op_save_map_[op_type] = save_func;
@@ -61,7 +61,7 @@ bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func)
 }
 
 tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tensor_t* tensor,
-                                         unsigned int tensor_id, unsigned int buffer_id)
+                          unsigned int tensor_id, unsigned int buffer_id)
 {
     TM2_Tensor tm_tensor;
     tm_tensor.tensor_id = tensor_id;
@@ -72,11 +72,11 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso
 
     bool tm_with_string = IsSaveString();
 
-    if(tm_with_string)
+    if (tm_with_string)
     {
         std::string name = tensor->name;
         TM2_String tensor_name;
-        tensor_name.size = name.size() + 1;    // including trailing \0
+        tensor_name.size = name.size() + 1; // including trailing \0
         tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size);
         tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM2_String));
     }
@@ -86,13 +86,13 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso
     /* Get the dims of the tensor */
     int* dim = tensor->dims;
     size_t vector_size;
-    if(tensor->dim_num)
+    if (tensor->dim_num)
     {
         /* Write the vector of dims */
         vector_size = sizeof(tm_size_t) + sizeof(int32_t) * tensor->dim_num;
-        TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_dims = (TM2_Vector_dims*)malloc(vector_size);
         v_dims->v_num = tensor->dim_num;
-        for(unsigned int i = 0; i < tensor->dim_num; i++)
+        for (unsigned int i = 0; i < tensor->dim_num; i++)
         {
             v_dims->dims[i] = dim[i];
         }
@@ -103,10 +103,10 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso
         tm_tensor.offset_vd_dims = TM2_NOT_SET;
 
     /* Write the quant params */
-    if(tensor->quant_param_num != 0)
+    if (tensor->quant_param_num != 0)
     {
         vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor->quant_param_num;
-        TM2_Vector_offsets* v_qtparams = ( TM2_Vector_offsets* )malloc(vector_size);
+        TM2_Vector_offsets* v_qtparams = (TM2_Vector_offsets*)malloc(vector_size);
         v_qtparams->v_num = tensor->quant_param_num;
         if (v_qtparams->v_num == 1)
         {
@@ -117,7 +117,7 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso
         }
         else if (v_qtparams->v_num > 1)
         {
-            for(unsigned int i = 0; i < v_qtparams->v_num; i++)
+            for (unsigned int i = 0; i < v_qtparams->v_num; i++)
             {
                 TM2_QuantParam qtparam;
                 qtparam.zero_point = tensor->zp_list[i];
@@ -126,7 +126,6 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso
                 v_qtparams->offsets[i] = WriteTmObject(start_ptr, cur_pos, &qtparam, sizeof(TM2_QuantParam));
             }
         }
-        
 
         /* Write the vector of quant params */
         tm_tensor.offect_vo_quantparams = WriteTmObject(start_ptr, cur_pos, v_qtparams, vector_size);
@@ -139,21 +138,20 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, ir_tenso
 }
 
 tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t* graph, ir_node_t* node,
-                                       name_map_t& tensor_name_map)
+                        name_map_t& tensor_name_map)
 {
-    
     TM2_Node tm_node;
-    memset(&tm_node, 0 , sizeof(TM2_Node));
+    memset(&tm_node, 0, sizeof(TM2_Node));
     tm_node.node_id = node->index;
     tm_node.dynamic_shape = node->dynamic_shape;
 
     bool tm_with_string = IsSaveString();
 
-    if(tm_with_string)
+    if (tm_with_string)
     {
         std::string name = node->name;
         TM2_String node_name;
-        node_name.size = name.size() + 1;    // including trailing \0
+        node_name.size = name.size() + 1; // including trailing \0
         node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size);
         tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM2_String));
     }
@@ -163,13 +161,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t
     unsigned int input_num = node->input_num;
     unsigned int output_num = node->output_num;
 
-    if(input_num)
+    if (input_num)
     {
         /* Write the vector of input indices */
         size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num;
-        TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size);
+        TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size);
         v_input_indices->v_num = input_num;
-        for(unsigned int i = 0; i < input_num; i++)
+        for (unsigned int i = 0; i < input_num; i++)
         {
             ir_tensor_t* p_tensor = get_ir_graph_tensor(graph, node->input_tensors[i]);
             v_input_indices->indices[i] = tensor_name_map[p_tensor->name];
@@ -180,13 +178,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t
     else
         tm_node.offset_vi_input_tensors = TM2_NOT_SET;
 
-    if(output_num)
+    if (output_num)
     {
         /* Write the vector of output indices */
         size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num;
-        TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size);
+        TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size);
         v_output_indices->v_num = output_num;
-        for(unsigned int i = 0; i < output_num; i++)
+        for (unsigned int i = 0; i < output_num; i++)
         {
             ir_tensor_t* p_tensor = get_ir_graph_tensor(graph, node->output_tensors[i]);
             v_output_indices->indices[i] = tensor_name_map[p_tensor->name];
@@ -199,7 +197,7 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, ir_graph_t
 
     /* Write tm operator */
     uint16_t op_type = node->op.type;
-    if(!op_save_map_.count(op_type))
+    if (!op_save_map_.count(op_type))
     {
         TLOG_ERR("cannot find save function for operator:%d \n", op_type);
         return false;
@@ -230,12 +228,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra
     bool tm_no_data = !IsSaveData();
     /* Write the nodes */
     size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->node_num;
-    TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_nodes = (TM2_Vector_offsets*)malloc(vector_size);
     v_nodes->v_num = graph->node_num;
-    for(unsigned int i = 0; i < graph->node_num; i++)
+    for (unsigned int i = 0; i < graph->node_num; i++)
     {
         ir_node_t* p_node = get_ir_graph_node(graph, i);
-        for(unsigned int k = 0; k < p_node->output_num; k++)
+        for (unsigned int k = 0; k < p_node->output_num; k++)
         {
             ir_tensor_t* p_tensor = get_ir_graph_tensor(graph, p_node->output_tensors[k]);
             tensor_ptrs.push_back(p_tensor);
@@ -249,12 +247,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra
 
     /* Write the tensors */
     vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num;
-    TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_tensors = (TM2_Vector_offsets*)malloc(vector_size);
     v_tensors->v_num = tensor_num;
-    for(unsigned int i = 0; i < tensor_num; i++)
+    for (unsigned int i = 0; i < tensor_num; i++)
     {
         ir_tensor_t* p_tensor = tensor_ptrs[i];
-        if(p_tensor->tensor_type == TENSOR_TYPE_CONST)
+        if (p_tensor->tensor_type == TENSOR_TYPE_CONST)
         {
             // buf_ptrs.push_back(p_tensor->GetMemAddr());
             buf_ptrs.push_back(p_tensor->data); // may cause bug
@@ -269,14 +267,14 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra
 
     /* Write the buffers */
     vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num;
-    TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_buffers = (TM2_Vector_offsets*)malloc(vector_size);
     v_buffers->v_num = buffer_num;
-    for(unsigned int i = 0; i < buffer_num; i++)
+    for (unsigned int i = 0; i < buffer_num; i++)
     {
         TM2_Buffer tm_buf;
         tm_buf.size = buf_sizes[i];
 
-        if(tm_no_data)
+        if (tm_no_data)
         {
             /* TM2_FOR_BENCHMARK environment variable exists. Not write buf data into the tm file */
             tm_buf.offset_data = TM2_NOT_SET;
@@ -284,8 +282,7 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra
         else
         {
             /* TM2_FOR_BENCHMARK environment variable does not exist */
-            tm_buf.offset_data =
-                WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast<const uint8_t*>(buf_ptrs[i]), tm_buf.size);
+            tm_buf.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast<const uint8_t*>(buf_ptrs[i]), tm_buf.size);
         }
         v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM2_Buffer));
     }
@@ -294,9 +291,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra
 
     /* Write the vector of input indices */
     vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_num;
-    TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size);
+    TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size);
     v_input_indices->v_num = graph->input_num;
-    for(unsigned int i = 0; i < graph->input_num; i++)
+    for (unsigned int i = 0; i < graph->input_num; i++)
     {
         v_input_indices->indices[i] = graph->input_nodes[i];
     }
@@ -304,9 +301,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, ir_gra
 
     /* Write the vector of output indices */
     vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_num;
-    TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size);
+    TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size);
     v_output_indices->v_num = graph->output_num;
-    for(unsigned int i = 0; i < graph->output_num; i++)
+    for (unsigned int i = 0; i < graph->output_num; i++)
     {
         v_output_indices->indices[i] = graph->output_nodes[i];
     }
@@ -356,7 +353,7 @@ bool SaveModelIntoMem(void* start_ptr, ir_graph_t* graph, uint32_t* tm_model_siz
     /* Write the subgraphs */
     /* Only 1 subgraph is supported currently */
     size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1;
-    TM2_Vector_offsets* v_subgraphs = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_subgraphs = (TM2_Vector_offsets*)malloc(vector_size);
     v_subgraphs->v_num = 1;
     v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph);
 
@@ -382,17 +379,16 @@ int save_model(std::vector<void*>& addr_list, std::vector<int>& size_list, ir_gr
 
     uint32_t malloc_size = TM_FILE_MAX_SIZE;
     const char* env = std::getenv("TM_FILE_MAX_SIZE");
-    if(env)
+    if (env)
         malloc_size = std::atoi(env);
 
-    void* start_ptr = ( void* )malloc(malloc_size);
-    if(start_ptr == nullptr)
+    void* start_ptr = (void*)malloc(malloc_size);
+    if (start_ptr == nullptr)
     {
-        TLOG_ERR("Malloc memory failed: .\n",malloc_size);
+        TLOG_ERR("Malloc memory failed: .\n", malloc_size);
         return false;
     }
 
-
     bool ret = SaveModelIntoMem(start_ptr, graph, &tm_model_size);
 
     addr_list.push_back(start_ptr);
@@ -411,16 +407,16 @@ bool save_graph(graph_t graph, const char* fname)
     ir_graph_t* ir_graph = (ir_graph_t*)graph;
     /* Open the tengine model file */
     int fd = open(fname, O_RDWR | O_CREAT | O_TRUNC, 0666);
-    if(fd == -1)
+    if (fd == -1)
     {
-        TLOG_ERR("Could not open %s\n",fname);
+        TLOG_ERR("Could not open %s\n", fname);
         return false;
     }
 
     std::vector<void*> addr_list;
     std::vector<int> size_list;
 
-    if(!save_model(addr_list, size_list, ir_graph))
+    if (!save_model(addr_list, size_list, ir_graph))
     {
         close(fd);
         return false;
@@ -433,7 +429,7 @@ bool save_graph(graph_t graph, const char* fname)
     close(fd);
     free(buf);
 
-    if(ret != size)
+    if (ret != size)
         return false;
     else
         return true;
diff --git a/tools/convert_tool/utils/save_graph/save_graph.hpp b/tools/convert_tool/utils/save_graph/save_graph.hpp
index 2267e01a5..cdc46b725 100644
--- a/tools/convert_tool/utils/save_graph/save_graph.hpp
+++ b/tools/convert_tool/utils/save_graph/save_graph.hpp
@@ -10,8 +10,7 @@
 #include <fcntl.h>
 #include <functional>
 
-extern "C" 
-{
+extern "C" {
 #include "tengine/c_api.h"
 #include "graph/graph.h"
 #include "graph/subgraph.h"
@@ -22,8 +21,6 @@ extern "C"
 #include "serializer/tmfile/tm2_format.h"
 }
 
-
 #include "tm2_op_save.hpp"
 
-
 bool save_graph(graph_t graph, const char* fname);
diff --git a/tools/convert_tool/utils/save_graph/tm2_generate.c b/tools/convert_tool/utils/save_graph/tm2_generate.c
index 71db31f8b..4ba97d177 100644
--- a/tools/convert_tool/utils/save_graph/tm2_generate.c
+++ b/tools/convert_tool/utils/save_graph/tm2_generate.c
@@ -28,7 +28,7 @@
 extern "C" {
 #endif
 
-#define ALIGN(pos, alignbytes) (((pos) + ( alignbytes )-1) & ~(( alignbytes )-1))
+#define ALIGN(pos, alignbytes) (((pos) + (alignbytes)-1) & ~((alignbytes)-1))
 
 uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size)
 {
diff --git a/tools/convert_tool/utils/save_graph/tm2_op_save.cpp b/tools/convert_tool/utils/save_graph/tm2_op_save.cpp
index d328c4749..c8b1d200f 100644
--- a/tools/convert_tool/utils/save_graph/tm2_op_save.cpp
+++ b/tools/convert_tool/utils/save_graph/tm2_op_save.cpp
@@ -27,8 +27,6 @@
 // #include "utility/log.h"
 // #include "tengine_ir.h"
 
-
-
 inline void SetTmOperator(TM2_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset)
 {
     tm_op->op_ver = TM2_OP_VER;
@@ -272,9 +270,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n
     TM2_PriorBoxParam tm_param;
 
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->min_size_num;
-    TM2_Vector_floats* v_minsizes = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_minsizes = (TM2_Vector_floats*)malloc(vector_size);
     v_minsizes->v_num = p->min_size_num;
-    for(unsigned int i = 0; i < p->min_size_num; i++)
+    for (unsigned int i = 0; i < p->min_size_num; i++)
     {
         v_minsizes->data[i] = p->min_size[i];
     }
@@ -282,9 +280,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n
     free(v_minsizes);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->max_size_num;
-    TM2_Vector_floats* v_maxsizes = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_maxsizes = (TM2_Vector_floats*)malloc(vector_size);
     v_maxsizes->v_num = p->max_size_num;
-    for(unsigned int i = 0; i < p->max_size_num; i++)
+    for (unsigned int i = 0; i < p->max_size_num; i++)
     {
         v_maxsizes->data[i] = p->max_size[i];
     }
@@ -293,9 +291,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n
 
     int variance_num = 4; // tengine lite does not set the variable.
     vector_size = sizeof(tm_size_t) + sizeof(float) * variance_num;
-    TM2_Vector_floats* v_variance = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_variance = (TM2_Vector_floats*)malloc(vector_size);
     v_variance->v_num = variance_num;
-    for(unsigned int i = 0; i < variance_num; i++)
+    for (unsigned int i = 0; i < variance_num; i++)
     {
         v_variance->data[i] = p->variance[i];
     }
@@ -303,9 +301,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_n
     free(v_variance);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->aspect_ratio_size;
-    TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size);
     v_ratios->v_num = p->aspect_ratio_size;
-    for(unsigned int i = 0; i < p->aspect_ratio_size; i++)
+    for (unsigned int i = 0; i < p->aspect_ratio_size; i++)
     {
         v_ratios->data[i] = p->aspect_ratio[i];
     }
@@ -340,9 +338,9 @@ tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_nod
     tm_param.nms_threshold = p->nms_threshold;
 
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->biases_num;
-    TM2_Vector_floats* v_biases = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_biases = (TM2_Vector_floats*)malloc(vector_size);
     v_biases->v_num = p->biases_num;
-    for(unsigned int i = 0; i < p->biases_num; i++)
+    for (unsigned int i = 0; i < p->biases_num; i++)
     {
         v_biases->data[i] = p->biases[i];
     }
@@ -387,36 +385,35 @@ tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_no
 {
     struct reshape_param* p = (struct reshape_param*)node->op.param_mem;
     TM2_ReshapeParam tm_param;
-    if(p->reverse)
+    if (p->reverse)
         tm_param.reverse = 1;
     else
         tm_param.reverse = 0;
-    if(p->is_mxnet)
+    if (p->is_mxnet)
         tm_param.is_mxnet = 1;
     else
         tm_param.is_mxnet = 0;
 
-    if(p->dim_size)
+    if (p->dim_size)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->dim_size;
-        TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size);
         v_re_shape->v_num = p->dim_size;
-        for(unsigned int i = 0; i < p->dim_size; i++)
+        for (unsigned int i = 0; i < p->dim_size; i++)
         {
             v_re_shape->dims[i] = p->re_shape[i];
         }
         tm_param.offset_re_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size);
         free(v_re_shape);
     }
-    else{
+    else
+    {
         tm_param.offset_re_shape = TM2_NOT_SET;
     }
 
-
     TM2_Operator tm_op;
     SetTmOperator(&tm_op, TM2_OPTYPE_RESHAPE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReshapeParam)));
     return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
-
 }
 
 tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node)
@@ -453,9 +450,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t
     TM2_RPNParam tm_param;
 
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->ratios->elem_num;
-    TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size);
     v_ratios->v_num = p->ratios->elem_num;
-    for(unsigned int i = 0; i < p->ratios->elem_num; i++)
+    for (unsigned int i = 0; i < p->ratios->elem_num; i++)
     {
         v_ratios->data[i] = *(float*)get_vector_data(p->ratios, i);
     }
@@ -463,9 +460,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t
     free(v_ratios);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchor_scales->elem_num;
-    TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size);
     v_scales->v_num = p->anchor_scales->elem_num;
-    for(unsigned int i = 0; i < p->anchor_scales->elem_num; i++)
+    for (unsigned int i = 0; i < p->anchor_scales->elem_num; i++)
     {
         v_scales->data[i] = *(float*)get_vector_data(p->anchor_scales, i);
     }
@@ -473,9 +470,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t
     free(v_scales);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchors_->elem_num * 4;
-    TM2_Vector_anchors* v_anchors = ( TM2_Vector_anchors* )malloc(vector_size);
+    TM2_Vector_anchors* v_anchors = (TM2_Vector_anchors*)malloc(vector_size);
     v_anchors->v_num = p->anchors_->elem_num;
-    for(unsigned int i = 0; i < p->anchors_->elem_num; i++)
+    for (unsigned int i = 0; i < p->anchors_->elem_num; i++)
     {
         v_anchors->data[i][0] = ((Anchor_t*)get_vector_data(p->anchors_, i))->x0;
         v_anchors->data[i][1] = ((Anchor_t*)get_vector_data(p->anchors_, i))->y0;
@@ -523,12 +520,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node
     tm_param.isonnx = p->isonnx;
     tm_param.ismxnet = p->ismxnet;
 
-    if(p->slice_point_ && p->slice_point_->elem_num)
+    if (p->slice_point_ && p->slice_point_->elem_num)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->slice_point_->elem_num;
-        TM2_Vector_dims* v_slice_points = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_slice_points = (TM2_Vector_dims*)malloc(vector_size);
         v_slice_points->v_num = p->slice_point_->elem_num;
-        for(unsigned int i = 0; i < p->slice_point_->elem_num; i++)
+        for (unsigned int i = 0; i < p->slice_point_->elem_num; i++)
         {
             v_slice_points->dims[i] = *(int32_t*)get_vector_data(p->slice_point_, i);
         }
@@ -538,12 +535,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node
     else
         tm_param.offset_vi_slice_points = TM2_NOT_SET;
 
-    if(p->begin_ && p->begin_->elem_num)
+    if (p->begin_ && p->begin_->elem_num)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->begin_->elem_num;
-        TM2_Vector_dims* v_begins = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_begins = (TM2_Vector_dims*)malloc(vector_size);
         v_begins->v_num = p->begin_->elem_num;
-        for(unsigned int i = 0; i < p->begin_->elem_num; i++)
+        for (unsigned int i = 0; i < p->begin_->elem_num; i++)
         {
             v_begins->dims[i] = *(int32_t*)get_vector_data(p->begin_, i);
         }
@@ -553,12 +550,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node
     else
         tm_param.offset_vi_begins = TM2_NOT_SET;
 
-    if(p->size_ && p->size_->elem_num)
+    if (p->size_ && p->size_->elem_num)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->size_->elem_num;
-        TM2_Vector_dims* v_sizes = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_sizes = (TM2_Vector_dims*)malloc(vector_size);
         v_sizes->v_num = p->size_->elem_num;
-        for(unsigned int i = 0; i < p->size_->elem_num; i++)
+        for (unsigned int i = 0; i < p->size_->elem_num; i++)
         {
             v_sizes->dims[i] = *(int32_t*)get_vector_data(p->size_, i);
         }
@@ -568,7 +565,6 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node
     else
         tm_param.offset_vi_sizes = TM2_NOT_SET;
 
-
     TM2_Operator tm_op;
     SetTmOperator(&tm_op, TM2_OPTYPE_SLICE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SliceParam)));
     return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
@@ -589,24 +585,27 @@ tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node
 {
     struct split_param* p = (struct split_param*)node->op.param_mem;
     TM2_SplitParam tm_param;
-    if(p->is_caffe)
+    if (p->is_caffe)
         tm_param.is_caffe = 1;
     else
         tm_param.is_caffe = 0;
 
-    if(p->is_onnx){
+    if (p->is_onnx)
+    {
         tm_param.is_onnx = 1;
-    } else {
+    }
+    else
+    {
         tm_param.is_onnx = 0;
     }
-    if(!p->is_caffe)
+    if (!p->is_caffe)
     {
-        if(p->is_onnx)
+        if (p->is_onnx)
             tm_param.axis = p->axis;
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->split_sizes_->elem_num;
-        TM2_Vector_dims* v_split_sizes = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_split_sizes = (TM2_Vector_dims*)malloc(vector_size);
         v_split_sizes->v_num = p->split_sizes_->elem_num;
-        for(unsigned int i = 0; i < p->split_sizes_->elem_num; i++)
+        for (unsigned int i = 0; i < p->split_sizes_->elem_num; i++)
         {
             v_split_sizes->dims[i] = *(int32_t*)get_vector_data(p->split_sizes_, i);
         }
@@ -633,9 +632,9 @@ tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* c
 
     int param_scales_num = 4;
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * param_scales_num;
-    TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size);
     v_scales->v_num = param_scales_num;
-    for(unsigned int i = 0; i < param_scales_num; i++)
+    for (unsigned int i = 0; i < param_scales_num; i++)
     {
         v_scales->data[i] = p->scales[i];
     }
@@ -777,7 +776,7 @@ tm_uoffset_t SaveTmTopKV2Op(void* const start_ptr, tm_uoffset_t* cur_pos, ir_nod
     TM2_TopKV2Param tm_param;
 
     tm_param.k = p->k;
-    if(p->sorted)
+    if (p->sorted)
         tm_param.sorted = 1;
     else
         tm_param.sorted = 0;
@@ -989,7 +988,7 @@ tm_uoffset_t SaveTmExpanddimsOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir
     struct expanddims_param* p = (struct expanddims_param*)node->op.param_mem;
     TM2_ExpanddimsParam tm_param;
 
-    tm_param.axis= p->axis;
+    tm_param.axis = p->axis;
 
     TM2_Operator tm_op;
     SetTmOperator(&tm_op, TM2_OPTYPE_EXPANDDIMS, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ExpanddimsParam)));
@@ -1170,19 +1169,20 @@ tm_uoffset_t SaveTmTransposeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_
 {
     struct transpose_param* p = (struct transpose_param*)node->op.param_mem;
     TM2_TransposeParam tm_param;
-    if(p->tr_shape_size)
+    if (p->tr_shape_size)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->tr_shape_size;
-        TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size);
         v_re_shape->v_num = p->tr_shape_size;
-        for(unsigned int i = 0; i < p->tr_shape_size; i++)
+        for (unsigned int i = 0; i < p->tr_shape_size; i++)
         {
             v_re_shape->dims[i] = p->tr_shape[i];
         }
         tm_param.offset_tr_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size);
         free(v_re_shape);
     }
-    else{
+    else
+    {
         tm_param.offset_tr_shape = TM2_NOT_SET;
     }
     TM2_Operator tm_op;
@@ -1282,12 +1282,12 @@ tm_uoffset_t SaveTmUnsqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_
     struct unsqueeze_param* p = (struct unsqueeze_param*)node->op.param_mem;
     TM2_UnsqueezeParam tm_param;
 
-    if(p->axises_size)
+    if (p->axises_size)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->axises_size;
-        TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_axises = (TM2_Vector_dims*)malloc(vector_size);
         v_axises->v_num = p->axises_size;
-        for(unsigned int i = 0; i < p->axises_size; i++)
+        for (unsigned int i = 0; i < p->axises_size; i++)
         {
             v_axises->dims[i] = p->axises[i];
         }
@@ -1329,19 +1329,18 @@ tm_uoffset_t SaveTmMatMulOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_nod
     return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
 }
 
-
-tm_uoffset_t SaveTmExpandOp(void* const start_ptr, tm_uoffset_t* cur_pos,ir_node_t* node)
+tm_uoffset_t SaveTmExpandOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node)
 {
     struct expand_param* p = (struct expand_param*)node->op.param_mem;
     TM2_ExpandParam tm_param;
     memset(&tm_param, 0, sizeof(TM2_ExpandParam));
 
-    if(p->dim_num)
+    if (p->dim_num)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->dim_num;
-        TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_axises = (TM2_Vector_dims*)malloc(vector_size);
         v_axises->v_num = p->dim_num;
-        for(unsigned int i = 0; i < p->dim_num; i++)
+        for (unsigned int i = 0; i < p->dim_num; i++)
         {
             v_axises->dims[i] = p->ex_shape[i];
         }
@@ -1359,27 +1358,28 @@ tm_uoffset_t SaveTmExpandOp(void* const start_ptr, tm_uoffset_t* cur_pos,ir_node
     return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
 }
 
-tm_uoffset_t SaveTmSpatialTransformerOp(void* const start_ptr, tm_uoffset_t* cur_pos,ir_node_t* node)
+tm_uoffset_t SaveTmSpatialTransformerOp(void* const start_ptr, tm_uoffset_t* cur_pos, ir_node_t* node)
 {
     struct spatialtransformer_param* p = (struct spatialtransformer_param*)node->op.param_mem;
     TM2_SpatialTransformerParam tm_param;
     memset(&tm_param, 0, sizeof(TM2_SpatialTransformerParam));
     tm_param.sampler_type = p->sampler_type;
     tm_param.transformer_type = p->transformer_type;
-    tm_param.shape_size = sizeof(p->target_shape)/sizeof(p->target_shape[0]);
-    if(tm_param.shape_size)
+    tm_param.shape_size = sizeof(p->target_shape) / sizeof(p->target_shape[0]);
+    if (tm_param.shape_size)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * tm_param.shape_size;
-        TM2_Vector_dims* v_ta_shape = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_ta_shape = (TM2_Vector_dims*)malloc(vector_size);
         v_ta_shape->v_num = tm_param.shape_size;
-        for(unsigned int i = 0; i < tm_param.shape_size; i++)
+        for (unsigned int i = 0; i < tm_param.shape_size; i++)
         {
             v_ta_shape->dims[i] = p->target_shape[i];
         }
         tm_param.offset_ta_shape = WriteTmObject(start_ptr, cur_pos, v_ta_shape, vector_size);
         free(v_ta_shape);
     }
-    else{
+    else
+    {
         tm_param.offset_ta_shape = TM2_NOT_SET;
     }
 
@@ -1387,187 +1387,186 @@ tm_uoffset_t SaveTmSpatialTransformerOp(void* const start_ptr, tm_uoffset_t* cur
     memset(&tm_op, 0, sizeof(TM2_Operator));
     SetTmOperator(&tm_op, TM2_OPTYPE_SPATIALTRANSFORMER, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SpatialTransformerParam)));
     return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
-
 }
 
 op_save_t SaveTmOpFunc(uint32_t op_type)
 {
-    switch(op_type)
+    switch (op_type)
     {
-        case OP_BATCHNORM:
-            return SaveTmBatchNormOp;
-        case OP_CONCAT:
-            return SaveTmConcatOp;
-        case OP_CONST:
-            return SaveTmConstOp;
-        case OP_CONV:
-            return SaveTmConvOp;
-        case OP_DECONV:
-            return SaveTmDeconvOp;
-        case OP_DETECTION_OUTPUT:
-            return SaveTmDetectionOutputOp;
-        case OP_DROPOUT:
-            return SaveTmDropoutOp;
-        case OP_ELTWISE:
-            return SaveTmEltwiseOp;
-        case OP_FLATTEN:
-            return SaveTmFlattenOp;
-        case OP_FC:
-            return SaveTmFCOp;
-        case OP_INPUT:
-            return SaveTmInputOp;
-        case OP_LRN:
-            return SaveTmLRNOp;
-        case OP_NORMALIZE:
-            return SaveTmNormalizeOp;
-        case OP_PERMUTE:
-            return SaveTmPermuteOp;
-        case OP_POOL:
-            return SaveTmPoolingOp;
-        case OP_PRELU:
-            return SaveTmPreluOp;
-        case OP_PRIORBOX:
-            return SaveTmPriorBoxOp;
-        case OP_REGION:
-            return SaveTmRegionOp;
-        case OP_RELU:
-            return SaveTmReLuOp;
-        case OP_RELU6:
-            return SaveTmRelu6Op;
-        case OP_REORG:
-            return SaveTmReorgOp;
-        case OP_RESHAPE:
-            return SaveTmReshapeOp;
-        case OP_ROIPOOLING:
-            return SaveTmROIPoolingOp;
-        case OP_RPN:
-            return SaveTmRPNOp;
-        case OP_SCALE:
-            return SaveTmScaleOp;
-        case OP_SLICE:
-            return SaveTmSliceOp;
-        case OP_SOFTMAX:
-            return SaveTmSoftmaxOp;
-        case OP_SPLIT:
-            return SaveTmSplitOp;
-        case OP_DETECTION_POSTPROCESS:
-            return SaveTmDetectionPostProcessOp;
-        case OP_GEMM:
-            return SaveTmGemmOp;
-        case OP_LOGISTIC:
-            return SaveTmLogisticOp;
-        case OP_LSTM:
-            return SaveTmLstmOp;
-        case OP_RNN:
-            return SaveTmRnnOp;
-        case OP_TANH:
-            return SaveTmTanhOp;
-        case OP_SIGMOID:
-            return SaveTmSigmoidOp;
-        case OP_SQUEEZE:
-            return SaveTmSqueezeOp;
-        case OP_SWAP_AXIS:
-            return SaveTmSwapAxisOp;
-        case OP_GRU:
-            return SaveTmGruOp;
-        case OP_ARGMAX:
-            return SaveTmArgMaxOp;
-        case OP_ARGMIN:
-            return SaveTmArgMinOp;
-        case OP_TOPKV2:
-            return SaveTmTopKV2Op;
-        case OP_PAD:
-            return SaveTmPadOp;
-        case OP_STRIDED_SLICE:
-            return SaveTmStridedSliceOp;
-        case OP_REDUCTION:
-            return SaveTmReductionOp;
-        case OP_UPSAMPLE:
-            return SaveTmUpsampleOp;
-        case OP_SHUFFLECHANNEL:
-            return SaveTmShuffleChannelOp;
-        case OP_SPACETOBATCHND:
-            return SaveTmSpaceToBatchNDOp;
-        case OP_BATCHTOSPACEND:
-            return SaveTmBatchToSpaceNDOp;
-        case OP_RESIZE:
-            return SaveTmResizeOp;
-        case OP_CROP:
-            return SaveTmCropOp;
-        case OP_ROIALIGN:
-            return SaveTmRoialignOp;
-        case OP_PSROIPOOLING:
-            return SaveTmPsroipoolingOp;
-        case OP_EXPANDDIMS:
-            return SaveTmExpanddimsOp;
-        case OP_UNARY:
-            return SaveTmUnaryOp;
-        case OP_NOOP:
-            return SaveTmNoopOp;
-        case OP_THRESHOLD:
-            return SaveTmThresholdOp;
-        case OP_HARDSIGMOID:
-            return SaveTmHardsigmoidOp;
-        case OP_EMBEDDING:
-            return SaveTmEmbedOp;
-        case OP_INSTANCENORM:
-            return SaveTmInstanceNormOp;
-        case OP_MVN:
-            return SaveTmMVNOp;
-        case OP_CAST:
-            return SaveTmCastOp;
-        case OP_HARDSWISH:
-            return SaveTmHardSwishOp;
-        case OP_INTERP:
-            return SaveTmInterpOp;
-        case OP_SELU:
-            return SaveTmSeluOp;
-        case OP_ELU:
-            return SaveTmEluOp;
-        case OP_BROADMUL:
-            return SaveTmBroadMulOp;
-        case OP_LOGICAL:
-            return SaveTmLogicalOp;
-        case OP_GATHER:
-            return SaveTmGatherOp;
-        case OP_TRANSPOSE:
-            return SaveTmTransposeOp;
-        case OP_COMPARISON:
-            return SaveTmComparisonOp;
-        case OP_REVERSE:
-            return SaveTmReverseOp;
-        case OP_SPACETODEPTH:
-            return SaveTmSpaceToDepthOp;
-        case OP_DEPTHTOSPACE:
-            return SaveTmDepthToSpaceOp;
-        case OP_SQUAREDDIFFERENCE:
-            return SaveTmSquaredDifferenceOp;
-        case OP_SPARSETODENSE:
-            return SaveTmSparseToDenseOp;
-        case OP_CEIL:
-            return SaveTmCeilOp;
-        case OP_ROUND:
-            return SaveTmRoundOp;
-        case OP_ZEROSLIKE:
-            return SaveTmZerosLikeOp;
-        case OP_CLIP:
-            return SaveTmClipOp;
-        case OP_REDUCEL2:
-            return SaveTmReduceL2Op;
-        case OP_UNSQUEEZE:
-            return SaveTmUnsqueezeOp;
-        case OP_MEAN:
-            return SaveTmMeanOp;
-        case OP_MATMUL:
-            return SaveTmMatMulOp;
-        case OP_MISH:
-            return SaveTmMishOp;
-        case OP_SPATIALTRANSFORMER:
-            return SaveTmSpatialTransformerOp;
-        case OP_EXPAND:
-            return SaveTmExpandOp;
-        default:
-            // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type);
-            return nullptr;
+    case OP_BATCHNORM:
+        return SaveTmBatchNormOp;
+    case OP_CONCAT:
+        return SaveTmConcatOp;
+    case OP_CONST:
+        return SaveTmConstOp;
+    case OP_CONV:
+        return SaveTmConvOp;
+    case OP_DECONV:
+        return SaveTmDeconvOp;
+    case OP_DETECTION_OUTPUT:
+        return SaveTmDetectionOutputOp;
+    case OP_DROPOUT:
+        return SaveTmDropoutOp;
+    case OP_ELTWISE:
+        return SaveTmEltwiseOp;
+    case OP_FLATTEN:
+        return SaveTmFlattenOp;
+    case OP_FC:
+        return SaveTmFCOp;
+    case OP_INPUT:
+        return SaveTmInputOp;
+    case OP_LRN:
+        return SaveTmLRNOp;
+    case OP_NORMALIZE:
+        return SaveTmNormalizeOp;
+    case OP_PERMUTE:
+        return SaveTmPermuteOp;
+    case OP_POOL:
+        return SaveTmPoolingOp;
+    case OP_PRELU:
+        return SaveTmPreluOp;
+    case OP_PRIORBOX:
+        return SaveTmPriorBoxOp;
+    case OP_REGION:
+        return SaveTmRegionOp;
+    case OP_RELU:
+        return SaveTmReLuOp;
+    case OP_RELU6:
+        return SaveTmRelu6Op;
+    case OP_REORG:
+        return SaveTmReorgOp;
+    case OP_RESHAPE:
+        return SaveTmReshapeOp;
+    case OP_ROIPOOLING:
+        return SaveTmROIPoolingOp;
+    case OP_RPN:
+        return SaveTmRPNOp;
+    case OP_SCALE:
+        return SaveTmScaleOp;
+    case OP_SLICE:
+        return SaveTmSliceOp;
+    case OP_SOFTMAX:
+        return SaveTmSoftmaxOp;
+    case OP_SPLIT:
+        return SaveTmSplitOp;
+    case OP_DETECTION_POSTPROCESS:
+        return SaveTmDetectionPostProcessOp;
+    case OP_GEMM:
+        return SaveTmGemmOp;
+    case OP_LOGISTIC:
+        return SaveTmLogisticOp;
+    case OP_LSTM:
+        return SaveTmLstmOp;
+    case OP_RNN:
+        return SaveTmRnnOp;
+    case OP_TANH:
+        return SaveTmTanhOp;
+    case OP_SIGMOID:
+        return SaveTmSigmoidOp;
+    case OP_SQUEEZE:
+        return SaveTmSqueezeOp;
+    case OP_SWAP_AXIS:
+        return SaveTmSwapAxisOp;
+    case OP_GRU:
+        return SaveTmGruOp;
+    case OP_ARGMAX:
+        return SaveTmArgMaxOp;
+    case OP_ARGMIN:
+        return SaveTmArgMinOp;
+    case OP_TOPKV2:
+        return SaveTmTopKV2Op;
+    case OP_PAD:
+        return SaveTmPadOp;
+    case OP_STRIDED_SLICE:
+        return SaveTmStridedSliceOp;
+    case OP_REDUCTION:
+        return SaveTmReductionOp;
+    case OP_UPSAMPLE:
+        return SaveTmUpsampleOp;
+    case OP_SHUFFLECHANNEL:
+        return SaveTmShuffleChannelOp;
+    case OP_SPACETOBATCHND:
+        return SaveTmSpaceToBatchNDOp;
+    case OP_BATCHTOSPACEND:
+        return SaveTmBatchToSpaceNDOp;
+    case OP_RESIZE:
+        return SaveTmResizeOp;
+    case OP_CROP:
+        return SaveTmCropOp;
+    case OP_ROIALIGN:
+        return SaveTmRoialignOp;
+    case OP_PSROIPOOLING:
+        return SaveTmPsroipoolingOp;
+    case OP_EXPANDDIMS:
+        return SaveTmExpanddimsOp;
+    case OP_UNARY:
+        return SaveTmUnaryOp;
+    case OP_NOOP:
+        return SaveTmNoopOp;
+    case OP_THRESHOLD:
+        return SaveTmThresholdOp;
+    case OP_HARDSIGMOID:
+        return SaveTmHardsigmoidOp;
+    case OP_EMBEDDING:
+        return SaveTmEmbedOp;
+    case OP_INSTANCENORM:
+        return SaveTmInstanceNormOp;
+    case OP_MVN:
+        return SaveTmMVNOp;
+    case OP_CAST:
+        return SaveTmCastOp;
+    case OP_HARDSWISH:
+        return SaveTmHardSwishOp;
+    case OP_INTERP:
+        return SaveTmInterpOp;
+    case OP_SELU:
+        return SaveTmSeluOp;
+    case OP_ELU:
+        return SaveTmEluOp;
+    case OP_BROADMUL:
+        return SaveTmBroadMulOp;
+    case OP_LOGICAL:
+        return SaveTmLogicalOp;
+    case OP_GATHER:
+        return SaveTmGatherOp;
+    case OP_TRANSPOSE:
+        return SaveTmTransposeOp;
+    case OP_COMPARISON:
+        return SaveTmComparisonOp;
+    case OP_REVERSE:
+        return SaveTmReverseOp;
+    case OP_SPACETODEPTH:
+        return SaveTmSpaceToDepthOp;
+    case OP_DEPTHTOSPACE:
+        return SaveTmDepthToSpaceOp;
+    case OP_SQUAREDDIFFERENCE:
+        return SaveTmSquaredDifferenceOp;
+    case OP_SPARSETODENSE:
+        return SaveTmSparseToDenseOp;
+    case OP_CEIL:
+        return SaveTmCeilOp;
+    case OP_ROUND:
+        return SaveTmRoundOp;
+    case OP_ZEROSLIKE:
+        return SaveTmZerosLikeOp;
+    case OP_CLIP:
+        return SaveTmClipOp;
+    case OP_REDUCEL2:
+        return SaveTmReduceL2Op;
+    case OP_UNSQUEEZE:
+        return SaveTmUnsqueezeOp;
+    case OP_MEAN:
+        return SaveTmMeanOp;
+    case OP_MATMUL:
+        return SaveTmMatMulOp;
+    case OP_MISH:
+        return SaveTmMishOp;
+    case OP_SPATIALTRANSFORMER:
+        return SaveTmSpatialTransformerOp;
+    case OP_EXPAND:
+        return SaveTmExpandOp;
+    default:
+        // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type);
+        return nullptr;
     }
 }
diff --git a/tools/convert_tool/utils/save_graph/tm2_op_save.hpp b/tools/convert_tool/utils/save_graph/tm2_op_save.hpp
index 7eafbe566..79456b646 100644
--- a/tools/convert_tool/utils/save_graph/tm2_op_save.hpp
+++ b/tools/convert_tool/utils/save_graph/tm2_op_save.hpp
@@ -3,15 +3,13 @@
 
 #include <functional>
 extern "C" {
-    #include "utility/vector.h"
-    #include "serializer/tmfile/tm2_format.h"
-    #include "tm2_generate.h"
-    #include "graph/node.h"
-    
-    #include "op_include.h"
-}
-
+#include "utility/vector.h"
+#include "serializer/tmfile/tm2_format.h"
+#include "tm2_generate.h"
+#include "graph/node.h"
 
+#include "op_include.h"
+}
 
 using op_save_t = std::function<tm_uoffset_t(void* const, tm_uoffset_t*, ir_node_t*)>;
 op_save_t SaveTmOpFunc(uint32_t op_type);
diff --git a/tools/quantize/compiler_fp16.h b/tools/quantize/compiler_fp16.h
index 1857d7eec..d770707c2 100644
--- a/tools/quantize/compiler_fp16.h
+++ b/tools/quantize/compiler_fp16.h
@@ -48,7 +48,7 @@ extern "C" {
 
 #else
 #ifdef _MSC_VER
-#pragma  pack (push,1)
+#pragma pack(push, 1)
 struct fp16_pack
 {
     unsigned short frac : 10;
@@ -84,12 +84,12 @@ typedef struct fp16_pack __fp16;
 static inline float fp16_to_fp32(__fp16 data)
 {
     float f;
-    struct fp32_pack* fp32 = ( struct fp32_pack* )&f;
+    struct fp32_pack* fp32 = (struct fp32_pack*)&f;
     struct fp16_pack* fp16 = &data;
 
     int exp = fp16->exp;
 
-    if(exp == 31 && fp16->frac != 0)
+    if (exp == 31 && fp16->frac != 0)
     {
         // return __builtin_inf()-__builtin_inf();
         fp32->sign = fp16->sign;
@@ -99,28 +99,28 @@ static inline float fp16_to_fp32(__fp16 data)
         return f;
     }
 
-    if(exp == 31)
+    if (exp == 31)
         exp = 255;
-    if(exp == 0)
+    if (exp == 0)
         exp = 0;
     else
         exp = (exp - 15) + 127;
 
     fp32->exp = exp;
     fp32->sign = fp16->sign;
-    fp32->frac = (( int )fp16->frac) << 13;
+    fp32->frac = ((int)fp16->frac) << 13;
 
     return f;
 }
 
 static inline __fp16 fp32_to_fp16(float data)
 {
-    struct fp32_pack* fp32 = ( struct fp32_pack* )&data;
+    struct fp32_pack* fp32 = (struct fp32_pack*)&data;
     struct fp16_pack fp16;
 
     int exp = fp32->exp;
 
-    if(fp32->exp == 255 && fp32->frac != 0)
+    if (fp32->exp == 255 && fp32->frac != 0)
     {
         // NaN
         fp16.exp = 31;
@@ -130,9 +130,9 @@ static inline __fp16 fp32_to_fp16(float data)
         return fp16;
     }
 
-    if((exp - 127) < -14)
+    if ((exp - 127) < -14)
         exp = 0;
-    else if((exp - 127) > 15)
+    else if ((exp - 127) > 15)
         exp = 31;
     else
         exp = exp - 127 + 15;
diff --git a/tools/quantize/quant_save_graph.cpp b/tools/quantize/quant_save_graph.cpp
index 0f8918fda..f53705076 100644
--- a/tools/quantize/quant_save_graph.cpp
+++ b/tools/quantize/quant_save_graph.cpp
@@ -1,1025 +1,1019 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: hhchen@openailab.com
- */
-
-
-#include <algorithm>
-
-#include "quant_save_graph.hpp"
-#include "compiler_fp16.h"
-
-#include "operator/prototype/convolution_param.h"
-#include "operator/prototype/pooling_param.h"
-#include "operator/prototype/relu_param.h"
-
-
-void recursion_pass_through(struct graph* ir_graph, const char* layer_name, struct tensor* t,
-                            std::tr1::unordered_map<std::string, int> &layer_used, std::tr1::unordered_map<std::string, float> &layer_scale,
-                            std::tr1::unordered_map<std::string, float> &layer_zeropoint, std::tr1::unordered_map<std::string, bool> &layer_pass)
-{
-    if (layer_pass[t->name] == false && layer_used[t->name] < 2)
-    {
-        t->scale = layer_scale[layer_name];
-        t->zero_point = layer_zeropoint[layer_name];
-        layer_scale[t->name] = layer_scale[layer_name];
-        layer_zeropoint[t->name] = layer_zeropoint[layer_name];
-
-        uint32_t ir_node_idx = t->producer;
-        struct node* t_node = ir_graph->node_list[ir_node_idx];
-
-        std::string op_name = get_op_name_from_type(t_node->op.type);
-        bool poolTrue = false;
-        bool reluTrue = false;
-        if (op_name == "Pooling")
-        {
-            struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem;
-            if (pool_param->pool_method == 0)
-                poolTrue = true;
-        }
-        else if (op_name == "ReLU")
-        {
-            struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem;
-            if (relu_param->negative_slope == 0.f)
-                reluTrue = true;
-        }
-        if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" ||
-            poolTrue || reluTrue)
-        {
-            struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]];
-            if (layer_scale[t->name] != 0)
-            {
-                if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT)
-                {
-                    recursion_pass_through(ir_graph, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass);
-                }
-            }
-        }
-        layer_pass[t->name] = true;
-    }
-}
-
-int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal)
-{
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n");
-
-    /* Step 1 : create graph, load tengine model xxx.tmfile */
-    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file);
-    if (nullptr == ir_graph)
-    {
-        fprintf(stderr, "Create graph failed.\n");
-        return -1;
-    }
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n");
-
-    std::tr1::unordered_map<std::string,float> layer_scale;
-    std::tr1::unordered_map<std::string,float> layer_zeropoint;
-
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file);
-    /* Step 2 : set activation quant scale value into ir_tensor */
-    if (nullptr != scale_file)
-    {
-        std::ifstream scales(scale_file);
-        std::string line;
-        while (std::getline(scales, line))
-        {
-            std::string layer_name;
-            float scale_val = 0.f;
-            float zero_point = 0.f;
-            size_t last = 0;
-            size_t index = line.find_first_of(' ', last);
-            size_t idx = line.find_last_of(' ', line.size());
-            layer_name = line.substr(last, index - last);
-            last = index + 1;
-            scale_val = atof((line.substr(last, line.size() - last)).c_str());
-            zero_point = atof((line.substr(idx + 1, line.size())).c_str());
-
-            layer_scale[layer_name] = scale_val;
-            layer_zeropoint[layer_name] = zero_point;
-
-//            fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point);
-        }
-    }
-
-    std::tr1::unordered_map<std::string,int> layer_used;
-    for (int i = 0; i < ir_graph->node_num; i++)
-    {
-        struct node* ir_node = ir_graph->node_list[i];
-        for (int j = 0; j < ir_node->input_num; j++ )
-        {
-            std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name;
-            layer_used[layern] ++;
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n");
-    /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */
-    if (inplace == 0)
-    {
-        for (int i = 0; i < ir_graph->tensor_num; i++)
-        {
-            struct tensor* ir_tensor = ir_graph->tensor_list[i];
-            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-            {
-                ir_tensor->scale      = layer_scale[ir_tensor->name];
-                ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-            }
-        }
-    }
-    else
-    {
-        std::tr1::unordered_map<std::string, bool> layer_pass;
-        for (int i = ir_graph->tensor_num-1; i >= 0; i--)
-        {
-            struct tensor* ir_tensor = ir_graph->tensor_list[i];
-            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-            {
-                if (layer_pass[ir_tensor->name] == false)
-                {
-                    uint32_t ir_node_idx = ir_tensor->producer;
-                    struct node* t_node = ir_graph->node_list[ir_node_idx];
-
-                    std::string op_name = get_op_name_from_type(t_node->op.type);
-
-                    bool poolTrue = false;
-                    bool reluTrue = false;
-                    if (op_name == "Pooling")
-                    {
-                        struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem;
-                        if (pool_param->pool_method == 0)
-                            poolTrue = true;
-                    }
-                    else if (op_name == "ReLU")
-                    {
-                        struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem;
-                        if (relu_param->negative_slope == 0.f)
-                            reluTrue = true;
-                    }
-
-                    if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" ||
-                        op_name == "Slice" || poolTrue || reluTrue)
-                    {
-                        struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]];
-                        if (layer_scale[ir_tensor->name] != 0)
-                        {
-                            ir_tensor->scale      = layer_scale[ir_tensor->name];
-                            ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-
-                            if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT)
-                            {
-                                recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass);
-                            }
-                        }
-                    }
-                    else
-                    {
-                        ir_tensor->scale = layer_scale[ir_tensor->name];
-                        ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-                    }
-                    layer_pass[ir_tensor->name] = true;
-                }
-            }
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n");
-
-    /* Set the params of acitvation ir_tensor */
-    for (int i = 0; i < ir_graph->tensor_num; i++)
-    {
-        struct tensor* ir_tensor = ir_graph->tensor_list[i];
-        if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-        {
-            ir_tensor->data_type = TENGINE_DT_UINT8;
-            ir_tensor->elem_size = sizeof(uint8_t);
-        }
-        ir_tensor->quant_param_num = 1;
-    }
-
-    /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */
-    for (int i = 0; i < ir_graph->node_num; i++)
-    {
-        struct node* noden = ir_graph->node_list[i];
-        std::string op_name = get_op_name_from_type(noden->op.type);
-
-        /* quantize the tensor data from fp32 to uint8 */
-        if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution")
-        {
-            /* Step 3.1 : quant weight */
-            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
-
-            uint8_t * u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t));
-            float* weight_data       = (float*)weight_tensor->data;
-
-            /* calculate the quant scale value of weight perchannel, scale = (min-max / 255) */
-            float weight_max = 0;
-            float weight_min = 0;
-            float weight_scale = 0;
-            int weight_zero_point = 0;
-
-            if (internal)
-            {
-                weight_scale = weight_tensor->scale;
-                weight_zero_point = weight_tensor->zero_point;
-            }
-            else
-            {
-                weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num);
-                weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num);
-                weight_scale = (weight_max - weight_min) / 255.f;
-                weight_zero_point = int(-weight_min/weight_scale);
-            }
-//            fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point);
-
-            /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale).round().clip(0, 255) */
-            for (int wi = 0; wi < weight_tensor->elem_num; wi++)
-            {
-                weight_data[wi] = roundf(weight_data[wi] / weight_scale + (float )weight_zero_point);
-                weight_data[wi] = weight_data[wi] > 255.f ? 255.f : weight_data[wi];
-                weight_data[wi] = weight_data[wi] < 0.f   ?   0.f : weight_data[wi];
-                u8_weight_data[wi] = uint8_t(weight_data[wi]);
-            }
-
-            weight_tensor->scale = weight_scale;
-            weight_tensor->zero_point = weight_zero_point;
-            weight_tensor->data_type = TENGINE_DT_UINT8;
-            weight_tensor->elem_size = sizeof(uint8_t);
-            weight_tensor->data = u8_weight_data;
-
-            /* step 3.2 : quant bias */
-            if (noden->input_num > 2)
-            {
-                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
-                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
-
-                int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * bias_tensor->elem_size);
-                float* bias_data     = (float*)bias_tensor->data;
-
-                /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */
-                float bias_scale = input_tensor->scale * weight_tensor->scale;
-
-                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
-                for (int bi = 0; bi < bias_tensor->elem_num; bi++)
-                {
-                    if (bias_scale == 0)
-                        int32_bias_data[bi] = 0;
-                    else
-                    {
-                        bias_data[bi] = roundf(bias_data[bi] / bias_scale);
-                        int32_bias_data[bi] = int(bias_data[bi]);
-                    }
-                }
-
-                bias_tensor->scale = bias_scale;
-                bias_tensor->data_type = TENGINE_DT_INT32;
-                bias_tensor->data = int32_bias_data;
-
-//                fprintf(stderr, "[bias]   scale final %8.4f\n", bias_scale);
-            }
-        }
-        /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */
-        else if (op_name == "PReLU")
-        {
-            for (int j = 0; j < noden->input_num; j++)
-            {
-                struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]];
-                if (in_tensor->tensor_type == TENSOR_TYPE_CONST)
-                {
-                    float* fp32_data =  (float*) in_tensor->data;
-                    int data_elem =  in_tensor->elem_num;
-
-                    __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16));
-
-                    for (int k = 0; k < data_elem; k++)
-                    {
-                        fp16_data[k] = fp32_to_fp16(fp32_data[k]);
-                    }
-
-                    in_tensor->data_type = TENGINE_DT_FP16;
-                    in_tensor->data = fp16_data;
-                    in_tensor->quant_param_num = 0;
-                }
-            }
-        }
-        else if (op_name == "Slice")
-        {
-            struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]);
-            struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
-            slice_output_tensor->scale = slice_input_tensor->scale;
-            slice_output_tensor->zero_point = slice_input_tensor->zero_point;
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n");
-
-    if (!save_graph(ir_graph, output_file.c_str()))
-    {
-        fprintf(stderr, "save graph failed.\n");
-        return -1;
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 6, save UInt8 tmfile done, %s\n", output_file.c_str());
-
-    return 0;
-}
-
-int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal)
-{
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n");
-
-    /* Step 1 : create graph, load tengine model xxx.tmfile */
-    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file);
-    if (nullptr == ir_graph)
-    {
-        fprintf(stderr, "Create graph failed.\n");
-        return -1;
-    }
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n");
-
-    std::tr1::unordered_map<std::string,float> layer_scale;
-    std::tr1::unordered_map<std::string,float> layer_zeropoint;
-
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file);
-    /* Step 2 : set activation quant scale value into ir_tensor */
-    if (nullptr != scale_file)
-    {
-        std::ifstream scales(scale_file);
-        std::string line;
-        while (std::getline(scales, line))
-        {
-            std::string layer_name;
-            float scale_val = 0.f;
-            float zero_point = 0.f;
-            size_t last = 0;
-            size_t index = line.find_first_of(' ', last);
-            size_t idx = line.find_last_of(' ', line.size());
-            layer_name = line.substr(last, index - last);
-            last = index + 1;
-            scale_val = atof((line.substr(last, line.size() - last)).c_str());
-            zero_point = atof((line.substr(idx + 1, line.size())).c_str());
-
-            layer_scale[layer_name] = scale_val;
-            layer_zeropoint[layer_name] = zero_point;
-
-//            fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point);
-        }
-    }
-
-    std::tr1::unordered_map<std::string,int> layer_used;
-    for (int i = 0; i < ir_graph->node_num; i++)
-    {
-        struct node* ir_node = ir_graph->node_list[i];
-        for (int j = 0; j < ir_node->input_num; j++ )
-        {
-            std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name;
-            layer_used[layern] ++;
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n");
-    /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */
-    if (inplace == 0)
-    {
-        for (int i = 0; i < ir_graph->tensor_num; i++)
-        {
-            struct tensor* ir_tensor = ir_graph->tensor_list[i];
-            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-            {
-                ir_tensor->scale      = layer_scale[ir_tensor->name];
-                ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-            }
-        }
-    }
-    else
-    {
-        std::tr1::unordered_map<std::string, bool> layer_pass;
-        for (int i = ir_graph->tensor_num-1; i >= 0; i--)
-        {
-            struct tensor* ir_tensor = ir_graph->tensor_list[i];
-            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-            {
-                if (layer_pass[ir_tensor->name] == false)
-                {
-                    uint32_t ir_node_idx = ir_tensor->producer;
-                    struct node* t_node = ir_graph->node_list[ir_node_idx];
-
-                    std::string op_name = get_op_name_from_type(t_node->op.type);
-
-                    bool poolTrue = false;
-                    bool reluTrue = false;
-                    if (op_name == "Pooling")
-                    {
-                        struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem;
-                        if (pool_param->pool_method == 0)
-                            poolTrue = true;
-                    }
-                    else if (op_name == "ReLU")
-                    {
-                        struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem;
-                        if (relu_param->negative_slope == 0.f)
-                            reluTrue = true;
-                    }
-
-                    if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" ||
-                        op_name == "Slice" || poolTrue || reluTrue)
-                    {
-                        struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]];
-                        if (layer_scale[ir_tensor->name] != 0)
-                        {
-                            ir_tensor->scale      = layer_scale[ir_tensor->name];
-                            ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-
-                            if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT)
-                            {
-                                recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass);
-                            }
-                        }
-                    }
-                    else
-                    {
-                        ir_tensor->scale = layer_scale[ir_tensor->name];
-                        ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-                    }
-                    layer_pass[ir_tensor->name] = true;
-                }
-            }
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n");
-
-    /* Set the params of acitvation ir_tensor */
-    for (int i = 0; i < ir_graph->tensor_num; i++)
-    {
-        struct tensor* ir_tensor = ir_graph->tensor_list[i];
-        if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-        {
-            ir_tensor->data_type = TENGINE_DT_INT8;
-            ir_tensor->elem_size = sizeof(int8_t);
-        }
-        ir_tensor->quant_param_num = 1;
-    }
-
-    /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */
-    FILE* fp_weight = fopen("scale_weight.txt", "wb");
-    FILE* fp_bias   = fopen("scale_bias.txt", "wb");
-    for (int i = 0; i < ir_graph->node_num; i++)
-    {
-        struct node* noden = ir_graph->node_list[i];
-        std::string op_name = get_op_name_from_type(noden->op.type);
-
-        /* quantize the tensor data from fp32 to uint8 */
-        if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution")
-        {
-            /* Step 3.1 : quant weight */
-            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
-
-            int channel_num = weight_tensor->dims[0];
-            int cstep = int(weight_tensor->elem_num / channel_num);
-            float* weight_data = ( float* )weight_tensor->data;
-            int8_t* i8_weight_data = ( int8_t* )sys_malloc(weight_tensor->elem_num * sizeof(int8_t));
-
-            float* weight_scale_list = ( float* )sys_malloc(channel_num * sizeof(float));
-            int* weight_zp_list = ( int* )sys_malloc(channel_num * sizeof(int));
-
-            fprintf(fp_weight, "%s ", weight_tensor->name);
-            /* calculate the quant scale value of weight perchannel, scale = abs(min, max) / 127 */
-            if (internal)
-            {
-                // TODO
-            }
-            else
-            {
-                for (int ch = 0; ch < channel_num; ch++)
-                {
-                    float* weight_data_ch_start = weight_data + ch * cstep;
-                    float* weight_data_ch_end   = weight_data + (ch + 1) * cstep;
-                    float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end);
-                    float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end);
-
-                    weight_scale_list[ch] = std::max(abs(weight_max), abs(weight_min)) / 127.f;
-                    weight_zp_list[ch] = 0;
-                    fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]);
-                }
-                fprintf(fp_weight, "\n");
-            }
-//            fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point);
-
-            /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */
-            for (int ch = 0; ch < channel_num; ch++)
-            {
-                for (int j = 0; j < cstep; j++)
-                {
-                    if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0)
-                        i8_weight_data[ch * cstep + j] = 0;
-                    else
-                    {
-                        float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]);
-                        int8_data = int8_data >  127.f ?  127.f : int8_data;
-                        int8_data = int8_data < -127.f ? -127.f : int8_data;
-                        i8_weight_data[ch * cstep + j] = int8_t(int8_data);
-                    }
-                }
-            }
-
-            weight_tensor->scale_list = weight_scale_list;
-            weight_tensor->zp_list = weight_zp_list;
-            weight_tensor->data_type = TENGINE_DT_INT8;
-            weight_tensor->elem_size = sizeof(int8_t); // int8, signed char
-            weight_tensor->data = i8_weight_data;
-            weight_tensor->quant_param_num = channel_num;
-
-            /* step 3.2 : quant bias */
-            if (noden->input_num > 2)
-            {
-                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
-                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
-
-                float* bias_scale_list = ( float* )sys_malloc(bias_tensor->dims[0] * sizeof(float));
-                int* bias_zp_list = ( int* )sys_malloc(bias_tensor->dims[0] * sizeof(int32_t));
-
-                float* bias_data = ( float* )bias_tensor->data;
-                int* int32_bias_data = ( int* )sys_malloc(bias_tensor->elem_num * sizeof(int32_t));
-
-                int bstep = int(bias_tensor->elem_num / channel_num);
-
-                fprintf(fp_bias, "%s ", bias_tensor->name);
-
-                /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */
-                for (int ch = 0; ch < channel_num; ch++)
-                {
-                    bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale;
-                    bias_zp_list[ch] = 0;
-
-                    fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]);
-                }
-                fprintf(fp_bias, "\n");
-
-                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
-                for (int ch = 0; ch < channel_num; ch++)
-                {
-                    for (int bi = 0; bi < bstep; bi++)
-                    {
-                        if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0)
-                            int32_bias_data[ch * bstep + bi] = 0;
-                        else
-                            int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch]));
-                    }
-                }
-
-                bias_tensor->scale_list = bias_scale_list;
-                bias_tensor->zp_list    = bias_zp_list;
-                bias_tensor->data_type  = TENGINE_DT_INT32;
-                bias_tensor->elem_size  = sizeof(int32_t); // int32, signed int
-                bias_tensor->data = int32_bias_data;
-                bias_tensor->quant_param_num = channel_num;
-
-                // fprintf(stderr, "bias   %8.8f \t%s\n", bias_scale_list[0], bias_tensor->name);
-            }
-            // fprintf(stderr, "\n");
-        }
-        /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */
-        else if (op_name == "PReLU")
-        {
-            for (int j = 0; j < noden->input_num; j++)
-            {
-                struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]];
-                if (in_tensor->tensor_type == TENSOR_TYPE_CONST)
-                {
-                    float* fp32_data =  (float*) in_tensor->data;
-                    int data_elem =  in_tensor->elem_num;
-
-                    __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16));
-
-                    for (int k = 0; k < data_elem; k++)
-                    {
-                        fp16_data[k] = fp32_to_fp16(fp32_data[k]);
-                    }
-
-                    in_tensor->data_type = TENGINE_DT_FP16;
-                    in_tensor->data = fp16_data;
-                    in_tensor->quant_param_num = 0;
-                }
-            }
-        }
-        else if (op_name == "Slice")
-        {
-            struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]);
-            struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
-            slice_output_tensor->scale = slice_input_tensor->scale;
-            slice_output_tensor->zero_point = slice_input_tensor->zero_point;
-        }
-    }
-
-    fclose(fp_weight);
-    fclose(fp_bias);    
-
-    fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n");
-
-    if (!save_graph(ir_graph, output_file.c_str()))
-    {
-        fprintf(stderr, "save graph failed.\n");
-        return -1;
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str());
-
-    return 0;
-}
-
-int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal)
-{
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n");
-
-    /* Step 1 : create graph, load tengine model xxx.tmfile */
-    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file);
-    if (nullptr == ir_graph)
-    {
-        fprintf(stderr, "Create graph failed.\n");
-        return -1;
-    }
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n");
-
-    std::tr1::unordered_map<std::string,float> layer_scale;
-    std::tr1::unordered_map<std::string,float> layer_zeropoint;
-
-    fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file);
-    /* Step 2 : set activation quant scale value into ir_tensor */
-    if (nullptr != scale_file)
-    {
-        std::ifstream scales(scale_file);
-        std::string line;
-        while (std::getline(scales, line))
-        {
-            std::string layer_name;
-            float scale_val = 0.f;
-            float zero_point = 0.f;
-            size_t last = 0;
-            size_t index = line.find_first_of(' ', last);
-            size_t idx = line.find_last_of(' ', line.size());
-            layer_name = line.substr(last, index - last);
-            last = index + 1;
-            scale_val = atof((line.substr(last, line.size() - last)).c_str());
-            zero_point = atof((line.substr(idx + 1, line.size())).c_str());
-
-            layer_scale[layer_name] = scale_val;
-            layer_zeropoint[layer_name] = zero_point;
-
-//            fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point);
-        }
-    }
-
-    std::tr1::unordered_map<std::string,int> layer_used;
-    for (int i = 0; i < ir_graph->node_num; i++)
-    {
-        struct node* ir_node = ir_graph->node_list[i];
-        for (int j = 0; j < ir_node->input_num; j++ )
-        {
-            std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name;
-            layer_used[layern] ++;
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n");
-    /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */
-    if (inplace == 0)
-    {
-        for (int i = 0; i < ir_graph->tensor_num; i++)
-        {
-            struct tensor* ir_tensor = ir_graph->tensor_list[i];
-            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-            {
-                ir_tensor->scale      = layer_scale[ir_tensor->name];
-                ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-            }
-        }
-    }
-    else
-    {
-        std::tr1::unordered_map<std::string, bool> layer_pass;
-        for (int i = ir_graph->tensor_num-1; i >= 0; i--)
-        {
-            struct tensor* ir_tensor = ir_graph->tensor_list[i];
-            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-            {
-                if (layer_pass[ir_tensor->name] == false)
-                {
-                    uint32_t ir_node_idx = ir_tensor->producer;
-                    struct node* t_node = ir_graph->node_list[ir_node_idx];
-
-                    std::string op_name = get_op_name_from_type(t_node->op.type);
-
-                    bool poolTrue = false;
-                    bool reluTrue = false;
-                    if (op_name == "Pooling")
-                    {
-                        struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem;
-                        if (pool_param->pool_method == 0)
-                            poolTrue = true;
-                    }
-                    else if (op_name == "ReLU")
-                    {
-                        struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem;
-                        if (relu_param->negative_slope == 0.f)
-                            reluTrue = true;
-                    }
-
-                    if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" ||
-                        op_name == "Slice" || poolTrue || reluTrue)
-                    {
-                        struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]];
-                        if (layer_scale[ir_tensor->name] != 0)
-                        {
-                            ir_tensor->scale      = layer_scale[ir_tensor->name];
-                            ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-
-                            if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT)
-                            {
-                                recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass);
-                            }
-                        }
-                    }
-                    else
-                    {
-                        ir_tensor->scale = layer_scale[ir_tensor->name];
-                        ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
-                    }
-                    layer_pass[ir_tensor->name] = true;
-                }
-            }
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n");
-
-    /* Set the params of acitvation ir_tensor */
-    for (int i = 0; i < ir_graph->tensor_num; i++)
-    {
-        struct tensor* ir_tensor = ir_graph->tensor_list[i];
-        if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-        {
-            ir_tensor->data_type = TENGINE_DT_UINT8;
-            ir_tensor->elem_size = sizeof(uint8_t);
-        }
-        ir_tensor->quant_param_num = 1;
-    }
-
-    /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */
-    FILE* fp_weight = fopen("scale_weight.txt", "wb");
-    FILE* fp_bias   = fopen("scale_bias.txt", "wb");
-    for (int i = 0; i < ir_graph->node_num; i++)
-    {
-        struct node* noden = ir_graph->node_list[i];
-        std::string op_name = get_op_name_from_type(noden->op.type);
-
-        /* quantize the tensor data from fp32 to uint8 */
-        if (op_name == "Convolution" )
-        {
-            /* Step 3.1 : quant weight */
-            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
-
-            int channel_num = weight_tensor->dims[0];
-            int cstep = int(weight_tensor->elem_num / channel_num);
-            float* weight_data = ( float* )weight_tensor->data;
-            int8_t* i8_weight_data = ( int8_t* )sys_malloc(weight_tensor->elem_num * sizeof(int8_t));
-
-            float* weight_scale_list = ( float* )sys_malloc(channel_num * sizeof(float));
-            int* weight_zp_list = ( int* )sys_malloc(channel_num * sizeof(int));
-
-            fprintf(fp_weight, "%s ", weight_tensor->name);
-            /* calculate the quant scale value of weight perchannel, scale = abs(min, max) / 127 */
-            if (internal)
-            {
-                // TODO
-            }
-            else
-            {
-                for (int ch = 0; ch < channel_num; ch++)
-                {
-                    float* weight_data_ch_start = weight_data + ch * cstep;
-                    float* weight_data_ch_end   = weight_data + (ch + 1) * cstep;
-                    float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end);
-                    float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end);
-
-                    weight_scale_list[ch] = std::max(abs(weight_max), abs(weight_min)) / 127.f;
-                    weight_zp_list[ch] = 0;
-                    fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]);
-                }
-                fprintf(fp_weight, "\n");
-            }
-//            fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point);
-
-            /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */
-            for (int ch = 0; ch < channel_num; ch++)
-            {
-                for (int j = 0; j < cstep; j++)
-                {
-                    if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0)
-                        i8_weight_data[ch * cstep + j] = 0;
-                    else
-                    {
-                        float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]);
-                        int8_data = int8_data >  127.f ?  127.f : int8_data;
-                        int8_data = int8_data < -127.f ? -127.f : int8_data;
-                        i8_weight_data[ch * cstep + j] = int8_t(int8_data);
-                    }
-                }
-            }
-
-            weight_tensor->scale_list = weight_scale_list;
-            weight_tensor->zp_list = weight_zp_list;
-            weight_tensor->data_type = TENGINE_DT_INT8;
-            weight_tensor->elem_size = sizeof(int8_t); // int8, signed char
-            weight_tensor->data = i8_weight_data;
-            weight_tensor->quant_param_num = channel_num;
-
-            /* step 3.2 : quant bias */
-            if (noden->input_num > 2)
-            {
-                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
-                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
-
-                float* bias_scale_list = ( float* )sys_malloc(bias_tensor->dims[0] * sizeof(float));
-                int* bias_zp_list = ( int* )sys_malloc(bias_tensor->dims[0] * sizeof(int32_t));
-
-                float* bias_data = ( float* )bias_tensor->data;
-                int* int32_bias_data = ( int* )sys_malloc(bias_tensor->elem_num * sizeof(int32_t));
-
-                int bstep = int(bias_tensor->elem_num / channel_num);
-
-                fprintf(fp_bias, "%s ", bias_tensor->name);
-
-                /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */
-                for (int ch = 0; ch < channel_num; ch++)
-                {
-                    bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale;
-                    bias_zp_list[ch] = 0;
-
-                    fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]);
-                }
-                fprintf(fp_bias, "\n");
-
-                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
-                for (int ch = 0; ch < channel_num; ch++)
-                {
-                    for (int bi = 0; bi < bstep; bi++)
-                    {
-                        if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0)
-                            int32_bias_data[ch * bstep + bi] = 0;
-                        else
-                            int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch]));
-                    }
-                }
-
-                bias_tensor->scale_list = bias_scale_list;
-                bias_tensor->zp_list    = bias_zp_list;
-                bias_tensor->data_type  = TENGINE_DT_INT32;
-                bias_tensor->elem_size  = sizeof(int32_t); // int32, signed int
-                bias_tensor->data = int32_bias_data;
-                bias_tensor->quant_param_num = channel_num;
-
-                // fprintf(stderr, "bias   %8.8f \t%s\n", bias_scale_list[0], bias_tensor->name);
-            }
-            // fprintf(stderr, "\n");
-        }
-        else if (op_name == "FullyConnected" || op_name == "Deconvolution")
-        {
-            /* Step 3.1 : quant weight */
-            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
-
-            uint8_t * u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t));
-            float* weight_data       = (float*)weight_tensor->data;
-
-            /* calculate the quant scale value of weight perchannel, scale = (min-max / 255) */
-            float weight_max = 0;
-            float weight_min = 0;
-            float weight_scale = 0;
-            int weight_zero_point = 0;
-
-            if (internal)
-            {
-                weight_scale = weight_tensor->scale;
-                weight_zero_point = weight_tensor->zero_point;
-            }
-            else
-            {
-                weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num);
-                weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num);
-                weight_scale = (weight_max - weight_min) / 255.f;
-                weight_zero_point = int(-weight_min/weight_scale);
-            }
-//            fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point);
-
-            /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale).round().clip(0, 255) */
-            for (int wi = 0; wi < weight_tensor->elem_num; wi++)
-            {
-                weight_data[wi] = roundf(weight_data[wi] / weight_scale + (float )weight_zero_point);
-                weight_data[wi] = weight_data[wi] > 255.f ? 255.f : weight_data[wi];
-                weight_data[wi] = weight_data[wi] < 0.f   ?   0.f : weight_data[wi];
-                u8_weight_data[wi] = uint8_t(weight_data[wi]);
-            }
-
-            weight_tensor->scale = weight_scale;
-            weight_tensor->zero_point = weight_zero_point;
-            weight_tensor->data_type = TENGINE_DT_UINT8;
-            weight_tensor->elem_size = sizeof(uint8_t);
-            weight_tensor->data = u8_weight_data;
-
-            /* step 3.2 : quant bias */
-            if (noden->input_num > 2)
-            {
-                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
-                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
-
-                int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * bias_tensor->elem_size);
-                float* bias_data     = (float*)bias_tensor->data;
-
-                /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */
-                float bias_scale = input_tensor->scale * weight_tensor->scale;
-
-                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
-                for (int bi = 0; bi < bias_tensor->elem_num; bi++)
-                {
-                    if (bias_scale == 0)
-                        int32_bias_data[bi] = 0;
-                    else
-                    {
-                        bias_data[bi] = roundf(bias_data[bi] / bias_scale);
-                        int32_bias_data[bi] = int(bias_data[bi]);
-                    }
-                }
-
-                bias_tensor->scale = bias_scale;
-                bias_tensor->data_type = TENGINE_DT_INT32;
-                bias_tensor->data = int32_bias_data;
-
-//                fprintf(stderr, "[bias]   scale final %8.4f\n", bias_scale);
-            }
-        }
-            /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */
-        else if (op_name == "PReLU")
-        {
-            for (int j = 0; j < noden->input_num; j++)
-            {
-                struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]];
-                if (in_tensor->tensor_type == TENSOR_TYPE_CONST)
-                {
-                    float* fp32_data =  (float*) in_tensor->data;
-                    int data_elem =  in_tensor->elem_num;
-
-                    __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16));
-
-                    for (int k = 0; k < data_elem; k++)
-                    {
-                        fp16_data[k] = fp32_to_fp16(fp32_data[k]);
-                    }
-
-                    in_tensor->data_type = TENGINE_DT_FP16;
-                    in_tensor->data = fp16_data;
-                    in_tensor->quant_param_num = 0;
-                }
-            }
-        }
-        else if (op_name == "Slice")
-        {
-            struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]);
-            struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
-            slice_output_tensor->scale = slice_input_tensor->scale;
-            slice_output_tensor->zero_point = slice_input_tensor->zero_point;
-        }
-    }
-
-    fclose(fp_weight);
-    fclose(fp_bias);
-
-    fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n");
-
-    if (!save_graph(ir_graph, output_file.c_str()))
-    {
-        fprintf(stderr, "save graph failed.\n");
-        return -1;
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str());
-
-    return 0;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: hhchen@openailab.com
+ */
+
+#include <algorithm>
+
+#include "quant_save_graph.hpp"
+#include "compiler_fp16.h"
+
+#include "operator/prototype/convolution_param.h"
+#include "operator/prototype/pooling_param.h"
+#include "operator/prototype/relu_param.h"
+
+/*
+ * Propagate the quant params registered for `layer_name` upstream onto tensor `t`.
+ *
+ * `t` takes the scale / zero point stored under `layer_name`, and the same pair is recorded
+ * under t->name. If the node producing `t` is Flatten, Reshape, Squeeze, Clip, Pooling with
+ * pool_method == 0 or ReLU with negative_slope == 0, and the scale is non-zero, the walk
+ * continues into that node's first input when it is a VAR or INPUT tensor. Tensors already
+ * flagged in `layer_pass`, or counted two or more times in `layer_used`, are left untouched.
+ */
+void recursion_pass_through(struct graph* ir_graph, const char* layer_name, struct tensor* t,
+                            std::tr1::unordered_map<std::string, int>& layer_used, std::tr1::unordered_map<std::string, float>& layer_scale,
+                            std::tr1::unordered_map<std::string, float>& layer_zeropoint, std::tr1::unordered_map<std::string, bool>& layer_pass)
+{
+    const std::string tensor_name = t->name;
+    if (layer_pass[tensor_name] || layer_used[tensor_name] >= 2)
+        return;
+
+    /* hand the consumer's params to this tensor and remember them under its own name */
+    const float scale = layer_scale[layer_name];
+    const float zero_point = layer_zeropoint[layer_name];
+    t->scale = scale;
+    t->zero_point = zero_point;
+    layer_scale[tensor_name] = scale;
+    layer_zeropoint[tensor_name] = zero_point;
+
+    const uint32_t producer_idx = t->producer;
+    struct node* producer = ir_graph->node_list[producer_idx];
+    const std::string op_name = get_op_name_from_type(producer->op.type);
+    /* only the op types checked below let the params travel further up the graph */
+    bool transparent = op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip";
+    if (op_name == "Pooling")
+        transparent = ((struct pool_param*)producer->op.param_mem)->pool_method == 0;
+    else if (op_name == "ReLU")
+        transparent = ((struct relu_param*)producer->op.param_mem)->negative_slope == 0.f;
+
+    if (transparent && scale != 0)
+    {
+        struct tensor* upstream = ir_graph->tensor_list[producer->input_tensors[0]];
+        if (upstream->tensor_type == TENSOR_TYPE_VAR || upstream->tensor_type == TENSOR_TYPE_INPUT)
+            recursion_pass_through(ir_graph, t->name, upstream, layer_used, layer_scale, layer_zeropoint, layer_pass);
+    }
+    layer_pass[tensor_name] = true;
+}
+
+/*
+ * Quantize an FP32 tmfile to UInt8 with one (scale, zero point) pair per tensor and save the result.
+ *
+ * model_file  : path of the FP32 tmfile to load.
+ * scale_file  : calibration table with one "<tensor name> <scale> [<zero point>]" entry per line; may be nullptr.
+ * output_file : path the quantized tmfile is written to.
+ * inplace     : 0 applies the table as is; otherwise the params of the output of Flatten, Reshape, Squeeze, Clip,
+ *               Slice, Pooling (pool_method 0) and ReLU (negative_slope 0) are also pushed onto that op's input.
+ * internal    : true takes the weight scale / zero point already stored in each weight tensor instead of deriving them.
+ *
+ * Returns 0 on success, -1 when the graph cannot be created or saved.
+ */
+int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal)
+{
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n");
+
+    /* Step 1 : create graph, load tengine model xxx.tmfile */
+    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file);
+    if (nullptr == ir_graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n");
+
+    std::tr1::unordered_map<std::string, float> layer_scale;
+    std::tr1::unordered_map<std::string, float> layer_zeropoint;
+
+    /* scale_file may be nullptr, and handing a null pointer to %s is undefined behaviour */
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file ? scale_file : "(null)");
+    /* Step 2 : set activation quant scale value into ir_tensor */
+    if (nullptr != scale_file)
+    {
+        std::ifstream scales(scale_file);
+        if (!scales.is_open())
+            fprintf(stderr, "[Quant Tools Info]: can not open calibration table file %s, no activation scale is loaded.\n", scale_file);
+        std::string line;
+        while (std::getline(scales, line))
+        {
+            std::string layer_name;
+            float scale_val = 0.f;
+            float zero_point = 0.f;
+            size_t last = 0;
+            size_t index = line.find_first_of(' ', last);
+            size_t idx = line.find_last_of(' ', line.size());
+            layer_name = line.substr(last, index - last);
+            last = index + 1;
+            scale_val = atof((line.substr(last, line.size() - last)).c_str());
+            /* without a third column the last field is the scale itself, so the zero point falls back to 0 */
+            zero_point = (idx == index) ? 0.f : atof((line.substr(idx + 1, line.size())).c_str());
+
+            layer_scale[layer_name] = scale_val;
+            layer_zeropoint[layer_name] = zero_point;
+        }
+    }
+
+    std::tr1::unordered_map<std::string, int> layer_used;
+    for (int i = 0; i < ir_graph->node_num; i++)
+    {
+        struct node* ir_node = ir_graph->node_list[i];
+        for (int j = 0; j < ir_node->input_num; j++)
+        {
+            std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name;
+            layer_used[layern]++;
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n");
+    /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */
+    if (inplace == 0)
+    {
+        for (int i = 0; i < ir_graph->tensor_num; i++)
+        {
+            struct tensor* ir_tensor = ir_graph->tensor_list[i];
+            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+            {
+                ir_tensor->scale = layer_scale[ir_tensor->name];
+                ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+            }
+        }
+    }
+    else
+    {
+        std::tr1::unordered_map<std::string, bool> layer_pass;
+        for (int i = ir_graph->tensor_num - 1; i >= 0; i--)
+        {
+            struct tensor* ir_tensor = ir_graph->tensor_list[i];
+            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+            {
+                if (layer_pass[ir_tensor->name] == false)
+                {
+                    uint32_t ir_node_idx = ir_tensor->producer;
+                    struct node* t_node = ir_graph->node_list[ir_node_idx];
+                    std::string op_name = get_op_name_from_type(t_node->op.type);
+
+                    bool pass_through = op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || op_name == "Slice";
+                    if (op_name == "Pooling")
+                        pass_through = ((struct pool_param*)t_node->op.param_mem)->pool_method == 0;
+                    else if (op_name == "ReLU")
+                        pass_through = ((struct relu_param*)t_node->op.param_mem)->negative_slope == 0.f;
+                    if (pass_through)
+                    {
+                        struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]];
+                        if (layer_scale[ir_tensor->name] != 0)
+                        {
+                            ir_tensor->scale = layer_scale[ir_tensor->name];
+                            ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+
+                            if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT)
+                            {
+                                recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        ir_tensor->scale = layer_scale[ir_tensor->name];
+                        ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+                    }
+                    layer_pass[ir_tensor->name] = true;
+                }
+            }
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n");
+
+    /* Set the params of activation ir_tensor */
+    for (int i = 0; i < ir_graph->tensor_num; i++)
+    {
+        struct tensor* ir_tensor = ir_graph->tensor_list[i];
+        if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+        {
+            ir_tensor->data_type = TENGINE_DT_UINT8;
+            ir_tensor->elem_size = sizeof(uint8_t);
+        }
+        ir_tensor->quant_param_num = 1;
+    }
+
+    /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to UInt8 */
+    for (int i = 0; i < ir_graph->node_num; i++)
+    {
+        struct node* noden = ir_graph->node_list[i];
+        std::string op_name = get_op_name_from_type(noden->op.type);
+
+        /* quantize the tensor data from fp32 to uint8 */
+        if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution")
+        {
+            /* Step 3.1 : quant weight */
+            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
+
+            uint8_t* u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t));
+            float* weight_data = (float*)weight_tensor->data;
+
+            /* calculate the quant scale value of weight for the whole tensor, scale = (max - min) / 255 */
+            float weight_scale = 0;
+            int weight_zero_point = 0;
+
+            if (internal)
+            {
+                weight_scale = weight_tensor->scale;
+                weight_zero_point = weight_tensor->zero_point;
+            }
+            else
+            {
+                const float weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num);
+                const float weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num);
+                weight_scale = (weight_max - weight_min) / 255.f;
+                /* constant weights give a zero scale: keep the zero point at 0 instead of converting inf / NaN to int */
+                weight_zero_point = weight_scale == 0.f ? 0 : int(-weight_min / weight_scale);
+            }
+
+            /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale + zero_point).round().clip(0, 255) */
+            for (int wi = 0; wi < weight_tensor->elem_num; wi++)
+            {
+                /* a local keeps the FP32 source buffer intact; with a zero scale every value maps to the zero point */
+                float quantized = weight_scale == 0.f ? (float)weight_zero_point : roundf(weight_data[wi] / weight_scale + (float)weight_zero_point);
+                quantized = quantized > 255.f ? 255.f : quantized;
+                quantized = quantized < 0.f ? 0.f : quantized;
+                u8_weight_data[wi] = uint8_t(quantized);
+            }
+
+            weight_tensor->scale = weight_scale;
+            weight_tensor->zero_point = weight_zero_point;
+            weight_tensor->data_type = TENGINE_DT_UINT8;
+            weight_tensor->elem_size = sizeof(uint8_t);
+            weight_tensor->data = u8_weight_data;
+
+            /* step 3.2 : quant bias */
+            if (noden->input_num > 2)
+            {
+                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
+                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
+
+                int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * sizeof(int32_t));
+                float* bias_data = (float*)bias_tensor->data;
+
+                /* calculate the quant scale value of bias for the whole tensor, scale = scale_weight * scale_in */
+                float bias_scale = input_tensor->scale * weight_tensor->scale;
+
+                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
+                for (int bi = 0; bi < bias_tensor->elem_num; bi++)
+                {
+                    if (bias_scale == 0)
+                        int32_bias_data[bi] = 0;
+                    else
+                    {
+                        const float rounded = roundf(bias_data[bi] / bias_scale);
+                        int32_bias_data[bi] = int(rounded);
+                    }
+                }
+
+                bias_tensor->scale = bias_scale;
+                bias_tensor->data_type = TENGINE_DT_INT32;
+                bias_tensor->data = int32_bias_data;
+            }
+        }
+        /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */
+        else if (op_name == "PReLU")
+        {
+            for (int j = 0; j < noden->input_num; j++)
+            {
+                struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]];
+                if (in_tensor->tensor_type == TENSOR_TYPE_CONST)
+                {
+                    float* fp32_data = (float*)in_tensor->data;
+                    int data_elem = in_tensor->elem_num;
+
+                    __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16));
+
+                    for (int k = 0; k < data_elem; k++)
+                    {
+                        fp16_data[k] = fp32_to_fp16(fp32_data[k]);
+                    }
+
+                    in_tensor->data_type = TENGINE_DT_FP16;
+                    in_tensor->data = fp16_data;
+                    in_tensor->quant_param_num = 0;
+                }
+            }
+        }
+        else if (op_name == "Slice")
+        {
+            struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]);
+            struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
+            slice_output_tensor->scale = slice_input_tensor->scale;
+            slice_output_tensor->zero_point = slice_input_tensor->zero_point;
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n");
+
+    if (!save_graph(ir_graph, output_file.c_str()))
+    {
+        fprintf(stderr, "save graph failed.\n");
+        return -1;
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 6, save UInt8 tmfile done, %s\n", output_file.c_str());
+
+    return 0;
+}
+
+/*
+ * Quantize an FP32 tmfile to Int8 with per-tensor activation params and per-channel weight / bias scales, then save it.
+ *
+ * model_file  : path of the FP32 tmfile to load.
+ * scale_file  : calibration table with one "<tensor name> <scale> [<zero point>]" entry per line; may be nullptr.
+ * output_file : path the quantized tmfile is written to.
+ * inplace     : 0 applies the table as is; otherwise the params of the output of Flatten, Reshape, Squeeze, Clip,
+ *               Slice, Pooling (pool_method 0) and ReLU (negative_slope 0) are also pushed onto that op's input.
+ * internal    : not supported for per-channel weights; the weight scales are always derived from the weight data.
+ *
+ * Also writes the weight / bias scales to scale_weight.txt and scale_bias.txt in the current working directory.
+ * Returns 0 on success, -1 when the graph cannot be created or saved.
+ */
+int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal)
+{
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n");
+
+    /* Step 1 : create graph, load tengine model xxx.tmfile */
+    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file);
+    if (nullptr == ir_graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n");
+
+    std::tr1::unordered_map<std::string, float> layer_scale;
+    std::tr1::unordered_map<std::string, float> layer_zeropoint;
+
+    /* scale_file may be nullptr, and handing a null pointer to %s is undefined behaviour */
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file ? scale_file : "(null)");
+    /* Step 2 : set activation quant scale value into ir_tensor */
+    if (nullptr != scale_file)
+    {
+        std::ifstream scales(scale_file);
+        if (!scales.is_open())
+            fprintf(stderr, "[Quant Tools Info]: can not open calibration table file %s, no activation scale is loaded.\n", scale_file);
+        std::string line;
+        while (std::getline(scales, line))
+        {
+            std::string layer_name;
+            float scale_val = 0.f;
+            float zero_point = 0.f;
+            size_t last = 0;
+            size_t index = line.find_first_of(' ', last);
+            size_t idx = line.find_last_of(' ', line.size());
+            layer_name = line.substr(last, index - last);
+            last = index + 1;
+            scale_val = atof((line.substr(last, line.size() - last)).c_str());
+            /* without a third column the last field is the scale itself, so the zero point falls back to 0 */
+            zero_point = (idx == index) ? 0.f : atof((line.substr(idx + 1, line.size())).c_str());
+
+            layer_scale[layer_name] = scale_val;
+            layer_zeropoint[layer_name] = zero_point;
+        }
+    }
+
+    std::tr1::unordered_map<std::string, int> layer_used;
+    for (int i = 0; i < ir_graph->node_num; i++)
+    {
+        struct node* ir_node = ir_graph->node_list[i];
+        for (int j = 0; j < ir_node->input_num; j++)
+        {
+            std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name;
+            layer_used[layern]++;
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n");
+    /* process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip .... */
+    if (inplace == 0)
+    {
+        for (int i = 0; i < ir_graph->tensor_num; i++)
+        {
+            struct tensor* ir_tensor = ir_graph->tensor_list[i];
+            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+            {
+                ir_tensor->scale = layer_scale[ir_tensor->name];
+                ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+            }
+        }
+    }
+    else
+    {
+        std::tr1::unordered_map<std::string, bool> layer_pass;
+        for (int i = ir_graph->tensor_num - 1; i >= 0; i--)
+        {
+            struct tensor* ir_tensor = ir_graph->tensor_list[i];
+            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+            {
+                if (layer_pass[ir_tensor->name] == false)
+                {
+                    uint32_t ir_node_idx = ir_tensor->producer;
+                    struct node* t_node = ir_graph->node_list[ir_node_idx];
+                    std::string op_name = get_op_name_from_type(t_node->op.type);
+
+                    bool pass_through = op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || op_name == "Slice";
+                    if (op_name == "Pooling")
+                        pass_through = ((struct pool_param*)t_node->op.param_mem)->pool_method == 0;
+                    else if (op_name == "ReLU")
+                        pass_through = ((struct relu_param*)t_node->op.param_mem)->negative_slope == 0.f;
+                    if (pass_through)
+                    {
+                        struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]];
+                        if (layer_scale[ir_tensor->name] != 0)
+                        {
+                            ir_tensor->scale = layer_scale[ir_tensor->name];
+                            ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+
+                            if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT)
+                            {
+                                recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        ir_tensor->scale = layer_scale[ir_tensor->name];
+                        ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+                    }
+                    layer_pass[ir_tensor->name] = true;
+                }
+            }
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n");
+
+    /* Set the params of activation ir_tensor */
+    for (int i = 0; i < ir_graph->tensor_num; i++)
+    {
+        struct tensor* ir_tensor = ir_graph->tensor_list[i];
+        if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+        {
+            ir_tensor->data_type = TENGINE_DT_INT8;
+            ir_tensor->elem_size = sizeof(int8_t);
+        }
+        ir_tensor->quant_param_num = 1;
+    }
+
+    /* Step 3 : set weight/bias quant scale value into ir_tensor, quant the weight params from Float32 to Int8 */
+    /* scales stored in the model cannot be reused per channel yet, so `internal` only triggers a notice and the */
+    /* scales are always recomputed from the weight data, which also keeps the scale lists fully initialized */
+    if (internal)
+        fprintf(stderr, "[Quant Tools Info]: internal per-channel weight scale is not supported yet, recompute it from the weight data.\n");
+
+    /* the scale dumps are best effort: every write below is skipped when a file could not be opened */
+    FILE* fp_weight = fopen("scale_weight.txt", "wb");
+    FILE* fp_bias = fopen("scale_bias.txt", "wb");
+    for (int i = 0; i < ir_graph->node_num; i++)
+    {
+        struct node* noden = ir_graph->node_list[i];
+        std::string op_name = get_op_name_from_type(noden->op.type);
+
+        /* quantize the tensor data from fp32 to int8 */
+        if (op_name == "Convolution" || op_name == "FullyConnected" || op_name == "Deconvolution")
+        {
+            /* Step 3.1 : quant weight */
+            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
+
+            int channel_num = weight_tensor->dims[0]; // NOTE(review): assumes dims[0] is the output-channel axis - confirm for Deconvolution
+            int cstep = int(weight_tensor->elem_num / channel_num);
+            float* weight_data = (float*)weight_tensor->data;
+            int8_t* i8_weight_data = (int8_t*)sys_malloc(weight_tensor->elem_num * sizeof(int8_t));
+
+            float* weight_scale_list = (float*)sys_malloc(channel_num * sizeof(float));
+            int* weight_zp_list = (int*)sys_malloc(channel_num * sizeof(int));
+
+            if (fp_weight) fprintf(fp_weight, "%s ", weight_tensor->name);
+            /* calculate the quant scale value of weight perchannel, scale = max(abs(min), abs(max)) / 127 */
+            for (int ch = 0; ch < channel_num; ch++)
+            {
+                float* weight_data_ch_start = weight_data + ch * cstep;
+                float* weight_data_ch_end = weight_data + (ch + 1) * cstep;
+                float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end);
+                float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end);
+
+                /* max >= min, so max(max, -min) equals max(|max|, |min|) without depending on which abs() overload is visible */
+                weight_scale_list[ch] = std::max(weight_max, -weight_min) / 127.f;
+                weight_zp_list[ch] = 0;
+                if (fp_weight) fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]);
+            }
+            if (fp_weight) fprintf(fp_weight, "\n");
+
+            /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */
+            for (int ch = 0; ch < channel_num; ch++)
+            {
+                for (int j = 0; j < cstep; j++)
+                {
+                    if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0)
+                        i8_weight_data[ch * cstep + j] = 0;
+                    else
+                    {
+                        float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]);
+                        int8_data = int8_data > 127.f ? 127.f : int8_data;
+                        int8_data = int8_data < -127.f ? -127.f : int8_data;
+                        i8_weight_data[ch * cstep + j] = int8_t(int8_data);
+                    }
+                }
+            }
+
+            weight_tensor->scale_list = weight_scale_list;
+            weight_tensor->zp_list = weight_zp_list;
+            weight_tensor->data_type = TENGINE_DT_INT8;
+            weight_tensor->elem_size = sizeof(int8_t); // int8, signed char
+            weight_tensor->data = i8_weight_data;
+            weight_tensor->quant_param_num = channel_num;
+
+            /* step 3.2 : quant bias */
+            if (noden->input_num > 2)
+            {
+                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
+                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
+
+                float* bias_scale_list = (float*)sys_malloc(channel_num * sizeof(float)); // sized by channel_num: the loops below index up to channel_num
+                int* bias_zp_list = (int*)sys_malloc(channel_num * sizeof(int32_t));
+
+                float* bias_data = (float*)bias_tensor->data;
+                int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * sizeof(int32_t));
+
+                int bstep = int(bias_tensor->elem_num / channel_num);
+
+                if (fp_bias) fprintf(fp_bias, "%s ", bias_tensor->name);
+
+                /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */
+                for (int ch = 0; ch < channel_num; ch++)
+                {
+                    bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale;
+                    bias_zp_list[ch] = 0;
+
+                    if (fp_bias) fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]);
+                }
+                if (fp_bias) fprintf(fp_bias, "\n");
+
+                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
+                for (int ch = 0; ch < channel_num; ch++)
+                {
+                    for (int bi = 0; bi < bstep; bi++)
+                    {
+                        if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0)
+                            int32_bias_data[ch * bstep + bi] = 0;
+                        else
+                            int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch]));
+                    }
+                }
+
+                bias_tensor->scale_list = bias_scale_list;
+                bias_tensor->zp_list = bias_zp_list;
+                bias_tensor->data_type = TENGINE_DT_INT32;
+                bias_tensor->elem_size = sizeof(int32_t); // int32, signed int
+                bias_tensor->data = int32_bias_data;
+                bias_tensor->quant_param_num = channel_num;
+            }
+        }
+        /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */
+        else if (op_name == "PReLU")
+        {
+            for (int j = 0; j < noden->input_num; j++)
+            {
+                struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]];
+                if (in_tensor->tensor_type == TENSOR_TYPE_CONST)
+                {
+                    float* fp32_data = (float*)in_tensor->data;
+                    int data_elem = in_tensor->elem_num;
+
+                    __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16));
+
+                    for (int k = 0; k < data_elem; k++)
+                    {
+                        fp16_data[k] = fp32_to_fp16(fp32_data[k]);
+                    }
+
+                    in_tensor->data_type = TENGINE_DT_FP16;
+                    in_tensor->data = fp16_data;
+                    in_tensor->quant_param_num = 0;
+                }
+            }
+        }
+        else if (op_name == "Slice")
+        {
+            struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]);
+            struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
+            slice_output_tensor->scale = slice_input_tensor->scale;
+            slice_output_tensor->zero_point = slice_input_tensor->zero_point;
+        }
+    }
+
+    if (fp_weight) fclose(fp_weight);
+    if (fp_bias) fclose(fp_bias);
+
+    fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n");
+
+    if (!save_graph(ir_graph, output_file.c_str()))
+    {
+        fprintf(stderr, "save graph failed.\n");
+        return -1;
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str());
+
+    return 0;
+}
+
+int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal)
+{ // Quantizes the FP32 tmfile model_file with the calibration table scale_file and saves the result to output_file; returns 0 on success, -1 on failure.
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again\n");
+    // inplace: 0 takes every activation scale straight from the table, otherwise pass-through ops are handled specially; internal: reuse scale/zero_point already stored on the weight tensor (Convolution path is still a TODO).
+    /* Step 1 : create the graph by loading the tengine model (tmfile) given by model_file */
+    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file);
+    if (nullptr == ir_graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load FP32 tmfile once again done.\n");
+    // tensor name -> activation scale / zero point parsed from the calibration table
+    std::tr1::unordered_map<std::string, float> layer_scale;
+    std::tr1::unordered_map<std::string, float> layer_zeropoint;
+
+    fprintf(stderr, "[Quant Tools Info]: Step 3, load calibration table file %s.\n", scale_file); // NOTE(review): scale_file is formatted with %s before the nullptr check below
+    /* Step 2 : read the calibration table; each line is parsed as "<tensor name> <scale> <zero point>" */
+    if (nullptr != scale_file)
+    {
+        std::ifstream scales(scale_file);
+        std::string line;
+        while (std::getline(scales, line))
+        {
+            std::string layer_name;
+            float scale_val = 0.f;
+            float zero_point = 0.f;
+            size_t last = 0;
+            size_t index = line.find_first_of(' ', last); // end of the tensor name
+            size_t idx = line.find_last_of(' ', line.size()); // start of the last field (zero point)
+            layer_name = line.substr(last, index - last);
+            last = index + 1; // NOTE(review): no check for index == npos (a line without any space)
+            scale_val = atof((line.substr(last, line.size() - last)).c_str());
+            zero_point = atof((line.substr(idx + 1, line.size())).c_str());
+
+            layer_scale[layer_name] = scale_val;
+            layer_zeropoint[layer_name] = zero_point;
+
+            //            fprintf(stderr, "[%s] \tscale final %8.4f, zero point %8.4f\n", layer_name.c_str(), scale_val, zero_point);
+        }
+    }
+    // count how many node inputs reference each tensor name
+    std::tr1::unordered_map<std::string, int> layer_used;
+    for (int i = 0; i < ir_graph->node_num; i++)
+    {
+        struct node* ir_node = ir_graph->node_list[i];
+        for (int j = 0; j < ir_node->input_num; j++)
+        {
+            std::string layern = ir_graph->tensor_list[ir_node->input_tensors[j]]->name;
+            layer_used[layern]++;
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 4, optimize the calibration table.\n");
+    /* inplace == 0: copy table values onto every VAR/INPUT tensor; otherwise walk tensors backwards and hand Flatten/Reshape/Squeeze/Clip/Slice, Pooling (pool_method 0) and ReLU (zero slope) outputs to recursion_pass_through */
+    if (inplace == 0)
+    {
+        for (int i = 0; i < ir_graph->tensor_num; i++)
+        {
+            struct tensor* ir_tensor = ir_graph->tensor_list[i];
+            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+            {
+                ir_tensor->scale = layer_scale[ir_tensor->name];
+                ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+            }
+        }
+    }
+    else
+    {
+        std::tr1::unordered_map<std::string, bool> layer_pass;
+        for (int i = ir_graph->tensor_num - 1; i >= 0; i--)
+        {
+            struct tensor* ir_tensor = ir_graph->tensor_list[i];
+            if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+            {
+                if (layer_pass[ir_tensor->name] == false)
+                {
+                    uint32_t ir_node_idx = ir_tensor->producer;
+                    struct node* t_node = ir_graph->node_list[ir_node_idx];
+
+                    std::string op_name = get_op_name_from_type(t_node->op.type);
+
+                    bool poolTrue = false;
+                    bool reluTrue = false;
+                    if (op_name == "Pooling")
+                    {
+                        struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem;
+                        if (pool_param->pool_method == 0) // presumably 0 means max pooling - TODO confirm against pool_param
+                            poolTrue = true;
+                    }
+                    else if (op_name == "ReLU")
+                    {
+                        struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem;
+                        if (relu_param->negative_slope == 0.f)
+                            reluTrue = true;
+                    }
+
+                    if (op_name == "Flatten" || op_name == "Reshape" || op_name == "Squeeze" || op_name == "Clip" || op_name == "Slice" || poolTrue || reluTrue)
+                    {
+                        struct tensor* t_in_tensor = ir_graph->tensor_list[t_node->input_tensors[0]];
+                        if (layer_scale[ir_tensor->name] != 0) // operator[] yields 0 for names missing from the table, so those tensors are left untouched here
+                        {
+                            ir_tensor->scale = layer_scale[ir_tensor->name];
+                            ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+
+                            if (t_in_tensor->tensor_type == TENSOR_TYPE_VAR || t_in_tensor->tensor_type == TENSOR_TYPE_INPUT)
+                            {
+                                recursion_pass_through(ir_graph, ir_tensor->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        ir_tensor->scale = layer_scale[ir_tensor->name];
+                        ir_tensor->zero_point = layer_zeropoint[ir_tensor->name];
+                    }
+                    layer_pass[ir_tensor->name] = true;
+                }
+            }
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 4, quantize activation tensor done.\n");
+
+    /* Mark every activation (VAR/INPUT) ir_tensor as uint8 */
+    for (int i = 0; i < ir_graph->tensor_num; i++)
+    {
+        struct tensor* ir_tensor = ir_graph->tensor_list[i];
+        if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+        {
+            ir_tensor->data_type = TENGINE_DT_UINT8;
+            ir_tensor->elem_size = sizeof(uint8_t);
+        }
+        ir_tensor->quant_param_num = 1; // set on every tensor; Convolution weight/bias and PReLU constants override it below
+    }
+
+    /* Step 3 : set weight/bias quant params into ir_tensor and quantize their data; Convolution scales are also dumped to scale_weight.txt / scale_bias.txt */
+    FILE* fp_weight = fopen("scale_weight.txt", "wb"); // NOTE(review): neither fopen result is checked before fprintf/fclose
+    FILE* fp_bias = fopen("scale_bias.txt", "wb");
+    for (int i = 0; i < ir_graph->node_num; i++)
+    {
+        struct node* noden = ir_graph->node_list[i];
+        std::string op_name = get_op_name_from_type(noden->op.type);
+
+        /* Convolution: weights are quantized per channel (along dims[0]) from fp32 to int8 with zero point 0, bias to int32 */
+        if (op_name == "Convolution")
+        {
+            /* Step 3.1 : quant weight */
+            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
+
+            int channel_num = weight_tensor->dims[0];
+            int cstep = int(weight_tensor->elem_num / channel_num); // number of weight elements per channel
+            float* weight_data = (float*)weight_tensor->data;
+            int8_t* i8_weight_data = (int8_t*)sys_malloc(weight_tensor->elem_num * sizeof(int8_t));
+
+            float* weight_scale_list = (float*)sys_malloc(channel_num * sizeof(float));
+            int* weight_zp_list = (int*)sys_malloc(channel_num * sizeof(int));
+
+            fprintf(fp_weight, "%s ", weight_tensor->name);
+            /* calculate the quant scale value of weight perchannel, scale = max(|min|, |max|) / 127 */
+            if (internal)
+            {
+                // TODO: not implemented. NOTE(review): weight_scale_list / weight_zp_list are never written on this path but are read below
+            }
+            else
+            {
+                for (int ch = 0; ch < channel_num; ch++)
+                {
+                    float* weight_data_ch_start = weight_data + ch * cstep;
+                    float* weight_data_ch_end = weight_data + (ch + 1) * cstep;
+                    float weight_max = *std::max_element(weight_data_ch_start, weight_data_ch_end);
+                    float weight_min = *std::min_element(weight_data_ch_start, weight_data_ch_end);
+
+                    weight_scale_list[ch] = std::max(abs(weight_max), abs(weight_min)) / 127.f; // NOTE(review): unqualified abs on float; confirm the floating-point overload is the one selected
+                    weight_zp_list[ch] = 0;
+                    fprintf(fp_weight, "%8.8f ", weight_scale_list[ch]);
+                }
+                fprintf(fp_weight, "\n");
+            }
+            //            fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point);
+
+            /* quantize the value of weight from Float32 to Int8, value_i8 = (value_fp32 / scale).round().clip(-127, 127) */
+            for (int ch = 0; ch < channel_num; ch++)
+            {
+                for (int j = 0; j < cstep; j++)
+                {
+                    if (weight_data[ch * cstep + j] == 0 || weight_scale_list[ch] == 0)
+                        i8_weight_data[ch * cstep + j] = 0;
+                    else
+                    {
+                        float int8_data = round(weight_data[ch * cstep + j] / weight_scale_list[ch]);
+                        int8_data = int8_data > 127.f ? 127.f : int8_data;
+                        int8_data = int8_data < -127.f ? -127.f : int8_data;
+                        i8_weight_data[ch * cstep + j] = int8_t(int8_data);
+                    }
+                }
+            }
+
+            weight_tensor->scale_list = weight_scale_list;
+            weight_tensor->zp_list = weight_zp_list;
+            weight_tensor->data_type = TENGINE_DT_INT8;
+            weight_tensor->elem_size = sizeof(int8_t); // int8, signed char
+            weight_tensor->data = i8_weight_data;
+            weight_tensor->quant_param_num = channel_num;
+
+            /* step 3.2 : quant bias (only when the node has a third input) */
+            if (noden->input_num > 2)
+            {
+                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
+                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
+
+                float* bias_scale_list = (float*)sys_malloc(bias_tensor->dims[0] * sizeof(float));
+                int* bias_zp_list = (int*)sys_malloc(bias_tensor->dims[0] * sizeof(int32_t));
+
+                float* bias_data = (float*)bias_tensor->data;
+                int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * sizeof(int32_t));
+
+                int bstep = int(bias_tensor->elem_num / channel_num); // number of bias elements per channel
+
+                fprintf(fp_bias, "%s ", bias_tensor->name);
+
+                /* calculate the quant scale value of bias perchannel, scale = scale_weight * scale_in */
+                for (int ch = 0; ch < channel_num; ch++)
+                {
+                    bias_scale_list[ch] = weight_scale_list[ch] * input_tensor->scale;
+                    bias_zp_list[ch] = 0;
+
+                    fprintf(fp_bias, "%8.8f ", bias_scale_list[ch]);
+                }
+                fprintf(fp_bias, "\n");
+
+                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
+                for (int ch = 0; ch < channel_num; ch++)
+                {
+                    for (int bi = 0; bi < bstep; bi++)
+                    {
+                        if (bias_data[ch * bstep + bi] == 0 || bias_scale_list[ch] == 0)
+                            int32_bias_data[ch * bstep + bi] = 0;
+                        else
+                            int32_bias_data[ch * bstep + bi] = int(round(bias_data[ch * bstep + bi] / bias_scale_list[ch]));
+                    }
+                }
+
+                bias_tensor->scale_list = bias_scale_list;
+                bias_tensor->zp_list = bias_zp_list;
+                bias_tensor->data_type = TENGINE_DT_INT32;
+                bias_tensor->elem_size = sizeof(int32_t); // int32, signed int
+                bias_tensor->data = int32_bias_data;
+                bias_tensor->quant_param_num = channel_num;
+
+                // fprintf(stderr, "bias   %8.8f \t%s\n", bias_scale_list[0], bias_tensor->name);
+            }
+            // fprintf(stderr, "\n");
+        }
+        else if (op_name == "FullyConnected" || op_name == "Deconvolution")
+        {
+            /* Step 3.1 : quant weight, one scale / zero point for the whole tensor, stored as uint8 */
+            struct tensor* weight_tensor = ir_graph->tensor_list[noden->input_tensors[1]];
+
+            uint8_t* u8_weight_data = (uint8_t*)sys_malloc(weight_tensor->elem_num * sizeof(uint8_t));
+            float* weight_data = (float*)weight_tensor->data;
+
+            /* calculate the quant params of weight per tensor, scale = (max - min) / 255, zero_point = int(-min / scale) */
+            float weight_max = 0;
+            float weight_min = 0;
+            float weight_scale = 0;
+            int weight_zero_point = 0;
+
+            if (internal)
+            {
+                weight_scale = weight_tensor->scale;
+                weight_zero_point = weight_tensor->zero_point;
+            }
+            else
+            {
+                weight_max = *std::max_element(weight_data, weight_data + weight_tensor->elem_num);
+                weight_min = *std::min_element(weight_data, weight_data + weight_tensor->elem_num);
+                weight_scale = (weight_max - weight_min) / 255.f;
+                weight_zero_point = int(-weight_min / weight_scale); // NOTE(review): weight_scale is 0 when max == min; there is no guard before this division or the one in the loop below
+            }
+            //            fprintf(stderr, "[weight] scale final %8.4f, zero point %4d\n", weight_scale, weight_zero_point);
+
+            /* quantize the value of weight from Float32 to UInt8, value_u8 = (value_fp32 / scale + zero_point).round().clip(0, 255) */
+            for (int wi = 0; wi < weight_tensor->elem_num; wi++)
+            {
+                weight_data[wi] = roundf(weight_data[wi] / weight_scale + (float)weight_zero_point); // the original fp32 weight buffer is overwritten in place
+                weight_data[wi] = weight_data[wi] > 255.f ? 255.f : weight_data[wi];
+                weight_data[wi] = weight_data[wi] < 0.f ? 0.f : weight_data[wi];
+                u8_weight_data[wi] = uint8_t(weight_data[wi]);
+            }
+
+            weight_tensor->scale = weight_scale;
+            weight_tensor->zero_point = weight_zero_point;
+            weight_tensor->data_type = TENGINE_DT_UINT8;
+            weight_tensor->elem_size = sizeof(uint8_t);
+            weight_tensor->data = u8_weight_data;
+
+            /* step 3.2 : quant bias (only when the node has a third input) */
+            if (noden->input_num > 2)
+            {
+                struct tensor* input_tensor = ir_graph->tensor_list[noden->input_tensors[0]];
+                struct tensor* bias_tensor = ir_graph->tensor_list[noden->input_tensors[2]];
+
+                int* int32_bias_data = (int*)sys_malloc(bias_tensor->elem_num * bias_tensor->elem_size);
+                float* bias_data = (float*)bias_tensor->data;
+
+                /* calculate the quant scale value of bias per tensor, scale = scale_weight * scale_in */
+                float bias_scale = input_tensor->scale * weight_tensor->scale;
+
+                /* quantize the value of bias from Float32 to Int32, value_i32 = (value_fp32 / scale).round() */
+                for (int bi = 0; bi < bias_tensor->elem_num; bi++)
+                {
+                    if (bias_scale == 0)
+                        int32_bias_data[bi] = 0;
+                    else
+                    {
+                        bias_data[bi] = roundf(bias_data[bi] / bias_scale); // the original fp32 bias buffer is overwritten in place
+                        int32_bias_data[bi] = int(bias_data[bi]);
+                    }
+                }
+
+                bias_tensor->scale = bias_scale;
+                bias_tensor->data_type = TENGINE_DT_INT32; // elem_size is left unchanged here, unlike the Convolution branch
+                bias_tensor->data = int32_bias_data;
+
+                //                fprintf(stderr, "[bias]   scale final %8.4f\n", bias_scale);
+            }
+        }
+        /* quantize the tensor data from fp32 to fp16, for TIM-VX NPU IP */
+        else if (op_name == "PReLU")
+        {
+            for (int j = 0; j < noden->input_num; j++)
+            {
+                struct tensor* in_tensor = ir_graph->tensor_list[noden->input_tensors[j]];
+                if (in_tensor->tensor_type == TENSOR_TYPE_CONST)
+                {
+                    float* fp32_data = (float*)in_tensor->data;
+                    int data_elem = in_tensor->elem_num;
+
+                    __fp16* fp16_data = (__fp16*)sys_malloc(data_elem * sizeof(__fp16));
+
+                    for (int k = 0; k < data_elem; k++)
+                    {
+                        fp16_data[k] = fp32_to_fp16(fp32_data[k]);
+                    }
+
+                    in_tensor->data_type = TENGINE_DT_FP16; // NOTE(review): elem_size is not updated for the fp16 buffer - confirm this is intended
+                    in_tensor->data = fp16_data;
+                    in_tensor->quant_param_num = 0;
+                }
+            }
+        }
+        else if (op_name == "Slice")
+        { // the first Slice output takes over the scale and zero point of the first input
+            struct tensor* slice_input_tensor = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]);
+            struct tensor* slice_output_tensor = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
+            slice_output_tensor->scale = slice_input_tensor->scale;
+            slice_output_tensor->zero_point = slice_input_tensor->zero_point;
+        }
+    }
+
+    fclose(fp_weight);
+    fclose(fp_bias);
+
+    fprintf(stderr, "[Quant Tools Info]: Step 5, quantize weight tensor done.\n");
+    // NOTE(review): ir_graph is not released anywhere in this function, on either return path below
+    if (!save_graph(ir_graph, output_file.c_str()))
+    {
+        fprintf(stderr, "save graph failed.\n");
+        return -1;
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 6, save Int8 tmfile done, %s\n", output_file.c_str());
+
+    return 0;
+}
diff --git a/tools/quantize/quant_save_graph.hpp b/tools/quantize/quant_save_graph.hpp
index e23f385df..ad6fed617 100644
--- a/tools/quantize/quant_save_graph.hpp
+++ b/tools/quantize/quant_save_graph.hpp
@@ -1,53 +1,53 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: hhchen@openailab.com
- */
-
-#include <cstdlib>
-#include <cstdio>
-#include <sys/stat.h>
-#include <dirent.h>
-
-#include <fstream> 
-#include <string>
-#include <cmath>
-#include <tr1/unordered_map>
-
-#include "quant_utils.hpp"
-#include "save_graph.hpp"
-
-#include "tengine/c_api.h"
-
-extern "C" {
-    #include "graph/graph.h"
-    #include "graph/subgraph.h"
-    #include "graph/node.h"
-    #include "graph/tensor.h"
-    #include "utility/sys_port.h"
-    #include "utility/utils.h"
-}
-
-int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal);
-
-int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal);
-
-int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal);
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: hhchen@openailab.com
+ */
+
+#include <cstdlib>
+#include <cstdio>
+#include <sys/stat.h>
+#include <dirent.h>
+
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <tr1/unordered_map>
+
+#include "quant_utils.hpp"
+#include "save_graph.hpp"
+
+#include "tengine/c_api.h"
+
+extern "C" {
+#include "graph/graph.h"
+#include "graph/subgraph.h"
+#include "graph/node.h"
+#include "graph/tensor.h"
+#include "utility/sys_port.h"
+#include "utility/utils.h"
+}
+
+int save_graph_u8_perlayer(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal);
+
+int save_graph_i8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal);
+
+int save_graph_u8_perchannel(const char* model_file, const char* scale_file, const std::string& output_file, int inplace, bool internal);
diff --git a/tools/quantize/quant_tool.hpp b/tools/quantize/quant_tool.hpp
index b07ec275e..35213bfa2 100644
--- a/tools/quantize/quant_tool.hpp
+++ b/tools/quantize/quant_tool.hpp
@@ -22,13 +22,11 @@
  * Author: hhchen@openailab.com
  */
 
-
 #include <string>
 #include <vector>
 #include <unordered_map>
 
-extern "C"
-{
+extern "C" {
 #include "tengine/c_api.h"
 #include "graph/graph.h"
 #include "graph/subgraph.h"
@@ -41,7 +39,6 @@ extern "C"
 #define ALGORITHM_MIN_MAX 0
 #define ALGORITHM_KL      1
 
-
 class QuantTool
 {
 public:
@@ -49,26 +46,27 @@ class QuantTool
     ~QuantTool();
 
     int activation_quant_tool();
-public: 
+
+public:
     struct options opt;
 
-    std::string model_file; // path to input float32 tmfile
-    std::string scale_file; // path to calibration scale file
-    std::string output_file;// path to output int8/uint8 tmfile
-    std::string image_dir;  // path to calibration images folder
+    std::string model_file;  // path to input float32 tmfile
+    std::string scale_file;  // path to calibration scale file
+    std::string output_file; // path to output int8/uint8 tmfile
+    std::string image_dir;   // path to calibration images folder
 
     int num_thread;
-    
+
     int img_c;
     int img_h;
     int img_w;
-    float mean[3];          // value of mean (mean value, default is 104.0,117.0,123.0)
-    float scale[3];         // value of normalize (scale value, default is 1.0,1.0,1.0)
-    int center_crop;        // flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0)
+    float mean[3];   // value of mean (mean value, default is 104.0,117.0,123.0)
+    float scale[3];  // value of normalize (scale value, default is 1.0,1.0,1.0)
+    int center_crop; // flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0)
     int letterbox_rows;
     int letterbox_cols;
-    int sw_RGB;             // flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1)
-    int focus;              // flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0)
-    int inplace;            // process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip
-    int algorithm_type;     // the type of quant algorithm(0:min-max, 1:kl, default is 0)
+    int sw_RGB;         // flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1)
+    int focus;          // flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0)
+    int inplace;        // process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip
+    int algorithm_type; // the type of quant algorithm(0:min-max, 1:kl, default is 0)
 };
diff --git a/tools/quantize/quant_tool_int8.cpp b/tools/quantize/quant_tool_int8.cpp
index 70bd39b07..009c6dd31 100644
--- a/tools/quantize/quant_tool_int8.cpp
+++ b/tools/quantize/quant_tool_int8.cpp
@@ -22,14 +22,12 @@
  * Author: hhchen@openailab.com
  */
 
-
 #include <algorithm>
 #include <cfloat>
 
 #include "quant_tool.hpp"
 #include "quant_save_graph.hpp"
 
-
 QuantTool::QuantTool()
 {
     // initial tengine
@@ -86,7 +84,7 @@ int QuantTool::activation_quant_tool()
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * img_c;
-    int dims[] = {1, img_c, img_h, img_w};    // nchw
+    int dims[] = {1, img_c, img_h, img_w}; // nchw
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0);
@@ -114,7 +112,7 @@ int QuantTool::activation_quant_tool()
         struct tensor* var_tensor = ir_graph->tensor_list[i];
         if (var_tensor->tensor_type == TENSOR_TYPE_VAR)
         {
-            var_tensor->data = ( float* )malloc(sizeof(float));
+            var_tensor->data = (float*)malloc(sizeof(float));
         }
     }
 
@@ -168,7 +166,7 @@ int QuantTool::activation_quant_tool()
     double total_time = 0.;
     for (int nums = 0; nums < img_num; nums++)
     {
-        fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums+1, img_num);
+        fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums + 1, img_num);
         get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus);
 
         /* run graph */
@@ -191,8 +189,8 @@ int QuantTool::activation_quant_tool()
             struct tensor* act_tensor = ir_graph->tensor_list[i];
             if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT)
             {
-                float* start_addr = ( float* )act_tensor->data;
-                float* end_addr   = ( float* )act_tensor->data + act_tensor->elem_num;
+                float* start_addr = (float*)act_tensor->data;
+                float* end_addr = (float*)act_tensor->data + act_tensor->elem_num;
                 max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr));
                 min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr));
             }
@@ -231,14 +229,14 @@ int QuantTool::activation_quant_tool()
                 }
             }
 
-            fprintf(fp_minmax,"%s %f %d\n",ir_graph->tensor_list[i]->name, act_scale, act_zero_point);
+            fprintf(fp_minmax, "%s %f %d\n", ir_graph->tensor_list[i]->name, act_scale, act_zero_point);
         }
     }
     fclose(fp_minmax);
     fprintf(stderr, "\r\n[Quant Tools Info]: Step 1, find original calibration table done, output ./table_minmax.scale\n");
 
     if (this->algorithm_type == ALGORITHM_KL)
-    {   
+    {
         /* todo support */
     }
 
@@ -285,58 +283,58 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                quant_tool.model_file = optarg;
-                break;
-            case 'a':
-                quant_tool.algorithm_type = atoi(optarg);
-                break;
-            case 'f':
-                quant_tool.scale_file = optarg;
-                break;
-            case 'o':
-                quant_tool.output_file = optarg;
-                break;
-            case 'i':
-                quant_tool.image_dir = optarg;
-                break;
-            case 'g':
-                float img_chw[3];
-                split(img_chw, optarg, ",");
-                quant_tool.img_c = (int)img_chw[0];
-                quant_tool.img_h = (int)img_chw[1];
-                quant_tool.img_w = (int)img_chw[2];
-                break;
-            case 'w':
-                split(quant_tool.mean, optarg, ",");
-                break;
-            case 's':
-                split(quant_tool.scale, optarg, ",");
-                break;
-            case 'b':
-                quant_tool.sw_RGB = atoi(optarg);
-                break;
-            case 'c':
-                quant_tool.center_crop = atoi(optarg);
-                break;
-            case 'y':
-                float letterboxs[2];
-                split(letterboxs, optarg, ",");
-                quant_tool.letterbox_rows = (int)letterboxs[0];
-                quant_tool.letterbox_cols = (int)letterboxs[1];
-                break;
-            case 'k':
-                quant_tool.focus = atoi(optarg);
-                break;                
-            case 't':
-                quant_tool.num_thread = atoi(optarg);
-                quant_tool.opt.num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            quant_tool.model_file = optarg;
+            break;
+        case 'a':
+            quant_tool.algorithm_type = atoi(optarg);
+            break;
+        case 'f':
+            quant_tool.scale_file = optarg;
+            break;
+        case 'o':
+            quant_tool.output_file = optarg;
+            break;
+        case 'i':
+            quant_tool.image_dir = optarg;
+            break;
+        case 'g':
+            float img_chw[3];
+            split(img_chw, optarg, ",");
+            quant_tool.img_c = (int)img_chw[0];
+            quant_tool.img_h = (int)img_chw[1];
+            quant_tool.img_w = (int)img_chw[2];
+            break;
+        case 'w':
+            split(quant_tool.mean, optarg, ",");
+            break;
+        case 's':
+            split(quant_tool.scale, optarg, ",");
+            break;
+        case 'b':
+            quant_tool.sw_RGB = atoi(optarg);
+            break;
+        case 'c':
+            quant_tool.center_crop = atoi(optarg);
+            break;
+        case 'y':
+            float letterboxs[2];
+            split(letterboxs, optarg, ",");
+            quant_tool.letterbox_rows = (int)letterboxs[0];
+            quant_tool.letterbox_cols = (int)letterboxs[1];
+            break;
+        case 'k':
+            quant_tool.focus = atoi(optarg);
+            break;
+        case 't':
+            quant_tool.num_thread = atoi(optarg);
+            quant_tool.opt.num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -348,21 +346,21 @@ int main(int argc, char* argv[])
     /* check input params */
     if (quant_tool.model_file.empty())
     {
-        fprintf(stderr,"[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n");
+        fprintf(stderr, "[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n");
         show_usage();
         return -1;
     }
 
     if (quant_tool.image_dir.empty())
     {
-        fprintf(stderr,"[Quant Tools Info]: The input dir of Calibration image not specified!\n");
+        fprintf(stderr, "[Quant Tools Info]: The input dir of Calibration image not specified!\n");
         show_usage();
         return -1;
     }
 
     if (quant_tool.output_file.empty())
     {
-        fprintf(stderr,"[Quant Tools Info]: The output file of Int8 tmfile not specified!\n");
+        fprintf(stderr, "[Quant Tools Info]: The output file of Int8 tmfile not specified!\n");
         show_usage();
         return -1;
     }
@@ -371,15 +369,15 @@ int main(int argc, char* argv[])
     fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str());
     fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str());
     fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str());
-    fprintf(stderr, "Scale file  : %s\n", quant_tool.scale_file.empty()?"NULL":quant_tool.scale_file.c_str());
-    fprintf(stderr, "Algorithm   : %s\n", quant_tool.algorithm_type?"KL":"MIN MAX");
+    fprintf(stderr, "Scale file  : %s\n", quant_tool.scale_file.empty() ? "NULL" : quant_tool.scale_file.c_str());
+    fprintf(stderr, "Algorithm   : %s\n", quant_tool.algorithm_type ? "KL" : "MIN MAX");
     fprintf(stderr, "Dims        : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w);
     fprintf(stderr, "Mean        : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]);
     fprintf(stderr, "Scale       : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]);
-    fprintf(stderr, "BGR2RGB     : %s\n", quant_tool.sw_RGB?"ON":"OFF");
-    fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop?"ON":"OFF");
+    fprintf(stderr, "BGR2RGB     : %s\n", quant_tool.sw_RGB ? "ON" : "OFF");
+    fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop ? "ON" : "OFF");
     fprintf(stderr, "Letter box  : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols);
-    fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus?"ON":"OFF");
+    fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? "ON" : "OFF");
     fprintf(stderr, "Thread num  : %d\n\n", quant_tool.num_thread);
 
     /* using 3rd calibration table file */
@@ -387,19 +385,19 @@ int main(int argc, char* argv[])
     {
         /* quantize activation */
         quant_tool.activation_quant_tool();
-        
+
         /* select algorithm */
         if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX)
-            quant_tool.scale_file = "table_minmax.scale";         
+            quant_tool.scale_file = "table_minmax.scale";
         else
         {
-            fprintf(stderr,"[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n");
+            fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n");
             quant_tool.scale_file = "table_minmax.scale";
         }
     }
 
     /* quantize weight/bias and save into int8 tmfile */
-    fprintf(stderr,"[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str());
+    fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str());
     save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false);
 
     fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n");
diff --git a/tools/quantize/quant_tool_uint8.cpp b/tools/quantize/quant_tool_uint8.cpp
index 733086283..660981e1f 100644
--- a/tools/quantize/quant_tool_uint8.cpp
+++ b/tools/quantize/quant_tool_uint8.cpp
@@ -22,14 +22,12 @@
  * Author: hhchen@openailab.com
  */
 
-
 #include <algorithm>
 #include <cfloat>
 
 #include "quant_tool.hpp"
 #include "quant_save_graph.hpp"
 
-
 QuantTool::QuantTool()
 {
     // initial tengine
@@ -86,7 +84,7 @@ int QuantTool::activation_quant_tool()
 
     /* set the shape, data buffer of input_tensor of the graph */
     int img_size = img_h * img_w * img_c;
-    int dims[] = {1, img_c, img_h, img_w};    // nchw
+    int dims[] = {1, img_c, img_h, img_w}; // nchw
     std::vector<float> input_data(img_size);
 
     tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0);
@@ -114,7 +112,7 @@ int QuantTool::activation_quant_tool()
         struct tensor* var_tensor = ir_graph->tensor_list[i];
         if (var_tensor->tensor_type == TENSOR_TYPE_VAR)
         {
-            var_tensor->data = ( float* )malloc(sizeof(float));
+            var_tensor->data = (float*)malloc(sizeof(float));
         }
     }
 
@@ -168,7 +166,7 @@ int QuantTool::activation_quant_tool()
     double total_time = 0.;
     for (int nums = 0; nums < img_num; nums++)
     {
-        fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums+1, img_num);
+        fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums + 1, img_num);
         get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus);
 
         /* run graph */
@@ -191,8 +189,8 @@ int QuantTool::activation_quant_tool()
             struct tensor* act_tensor = ir_graph->tensor_list[i];
             if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT)
             {
-                float* start_addr = ( float* )act_tensor->data;
-                float* end_addr   = ( float* )act_tensor->data + act_tensor->elem_num;
+                float* start_addr = (float*)act_tensor->data;
+                float* end_addr = (float*)act_tensor->data + act_tensor->elem_num;
                 max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr));
                 min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr));
             }
@@ -216,14 +214,14 @@ int QuantTool::activation_quant_tool()
             else if (min_activation[i] > 0)
             {
                 act_scale = (max_activation[i] - 0) / 255;
-                act_zero_point = 0;                
+                act_zero_point = 0;
             }
             else
             {
                 act_scale = (max_activation[i] - min_activation[i]) / 255;
                 act_zero_point = int(-min_activation[i] / act_scale);
             }
- 
+
             if (act_scale == 0)
                 act_zero_point = 0;
 
@@ -260,13 +258,13 @@ int QuantTool::activation_quant_tool()
         fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n");
         std::tr1::unordered_map<uint32_t, uint32_t> tensor_hist;
         std::tr1::unordered_map<uint32_t, uint32_t> hist_tensor;
-        std::vector<std::vector<float>> hist_edge;
-        std::vector<std::vector<uint32_t>> hist_gram;
+        std::vector<std::vector<float> > hist_edge;
+        std::vector<std::vector<uint32_t> > hist_gram;
 
         /* second loop, create histgram */
-        for (int nums = imgs_list.size()-1; nums >= 0; nums--)
+        for (int nums = imgs_list.size() - 1; nums >= 0; nums--)
         {
-            fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums+1, img_num);
+            fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums + 1, img_num);
 
             get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus);
 
@@ -296,12 +294,12 @@ int QuantTool::activation_quant_tool()
                             every_edge.push_back(edge_float);
                         }
                         hist_edge.push_back(every_edge);
-                        hist_gram.push_back(histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]));
+                        hist_gram.push_back(histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]));
                     }
                     else
                     {
                         std::vector<uint32_t> hist_tmp;
-                        hist_tmp = histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]);
+                        hist_tmp = histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]);
                         for (int j = 0; j < 2048; j++)
                         {
                             hist_gram[inum][j] += hist_tmp[j];
@@ -322,15 +320,15 @@ int QuantTool::activation_quant_tool()
         for (int i = 0; i < act_tensor_num; i++)
         {
             int threshold_bin = threshold_distribution(hist_gram[i], 256);
-    //        fprintf(stderr, " threshold_bin %d \n", threshold_bin);
+            //        fprintf(stderr, " threshold_bin %d \n", threshold_bin);
 
             std::vector<uint32_t> hist_gram_F(threshold_bin + 1);
-            for (int j = 0; j < threshold_bin+1; j++)
+            for (int j = 0; j < threshold_bin + 1; j++)
             {
                 hist_gram_F[j] = hist_gram[i][threshold_bin - j];
             }
             int threshold_bin_F = threshold_distribution(hist_gram_F, 256);
-            int threshold_bin_min = threshold_bin - threshold_bin_F + 1;   
+            int threshold_bin_min = threshold_bin - threshold_bin_F + 1;
 
             // fprintf(stderr, "### %s : %d   %f   %f & %f   %f\n",ir_graph->tensor_list[hist_tensor[i]]->name, threshold_bin, min_activation[hist_tensor[i]],\
             //                                        hist_edge[i][threshold_bin_min], hist_edge[i][threshold_bin],  max_activation[hist_tensor[i]]);
@@ -383,7 +381,7 @@ int QuantTool::activation_quant_tool()
             fprintf(fp_kl, "%s %f %d\n", ir_graph->tensor_list[hist_tensor[i]]->name, act_scale, act_zero_point);
         }
         fclose(fp_kl);
-        fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n");   
+        fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n");
     }
 
     fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num);
@@ -429,58 +427,58 @@ int main(int argc, char* argv[])
     {
         switch (res)
         {
-            case 'm':
-                quant_tool.model_file = optarg;
-                break;
-            case 'a':
-                quant_tool.algorithm_type = atoi(optarg);
-                break;
-            case 'f':
-                quant_tool.scale_file = optarg;
-                break;
-            case 'o':
-                quant_tool.output_file = optarg;
-                break;
-            case 'i':
-                quant_tool.image_dir = optarg;
-                break;
-            case 'g':
-                float img_chw[3];
-                split(img_chw, optarg, ",");
-                quant_tool.img_c = (int)img_chw[0];
-                quant_tool.img_h = (int)img_chw[1];
-                quant_tool.img_w = (int)img_chw[2];
-                break;
-            case 'w':
-                split(quant_tool.mean, optarg, ",");
-                break;
-            case 's':
-                split(quant_tool.scale, optarg, ",");
-                break;
-            case 'b':
-                quant_tool.sw_RGB = atoi(optarg);
-                break;
-            case 'c':
-                quant_tool.center_crop = atoi(optarg);
-                break;
-            case 'y':
-                float letterboxs[2];
-                split(letterboxs, optarg, ",");
-                quant_tool.letterbox_rows = (int)letterboxs[0];
-                quant_tool.letterbox_cols = (int)letterboxs[1];
-                break;
-            case 'k':
-                quant_tool.focus = atoi(optarg);
-                break;                
-            case 't':
-                quant_tool.num_thread = atoi(optarg);
-                quant_tool.opt.num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
+        case 'm':
+            quant_tool.model_file = optarg;
+            break;
+        case 'a':
+            quant_tool.algorithm_type = atoi(optarg);
+            break;
+        case 'f':
+            quant_tool.scale_file = optarg;
+            break;
+        case 'o':
+            quant_tool.output_file = optarg;
+            break;
+        case 'i':
+            quant_tool.image_dir = optarg;
+            break;
+        case 'g':
+            float img_chw[3];
+            split(img_chw, optarg, ",");
+            quant_tool.img_c = (int)img_chw[0];
+            quant_tool.img_h = (int)img_chw[1];
+            quant_tool.img_w = (int)img_chw[2];
+            break;
+        case 'w':
+            split(quant_tool.mean, optarg, ",");
+            break;
+        case 's':
+            split(quant_tool.scale, optarg, ",");
+            break;
+        case 'b':
+            quant_tool.sw_RGB = atoi(optarg);
+            break;
+        case 'c':
+            quant_tool.center_crop = atoi(optarg);
+            break;
+        case 'y':
+            float letterboxs[2];
+            split(letterboxs, optarg, ",");
+            quant_tool.letterbox_rows = (int)letterboxs[0];
+            quant_tool.letterbox_cols = (int)letterboxs[1];
+            break;
+        case 'k':
+            quant_tool.focus = atoi(optarg);
+            break;
+        case 't':
+            quant_tool.num_thread = atoi(optarg);
+            quant_tool.opt.num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default:
+            break;
         }
     }
 
@@ -492,21 +490,21 @@ int main(int argc, char* argv[])
     /* check input params */
     if (quant_tool.model_file.empty())
     {
-        fprintf(stderr,"[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n");
+        fprintf(stderr, "[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n");
         show_usage();
         return -1;
     }
 
     if (quant_tool.image_dir.empty())
     {
-        fprintf(stderr,"[Quant Tools Info]: The input dir of Calibration image not specified!\n");
+        fprintf(stderr, "[Quant Tools Info]: The input dir of Calibration image not specified!\n");
         show_usage();
         return -1;
     }
 
     if (quant_tool.output_file.empty())
     {
-        fprintf(stderr,"[Quant Tools Info]: The output file of Int8 tmfile not specified!\n");
+        fprintf(stderr, "[Quant Tools Info]: The output file of Int8 tmfile not specified!\n");
         show_usage();
         return -1;
     }
@@ -515,15 +513,15 @@ int main(int argc, char* argv[])
     fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str());
     fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str());
     fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str());
-    fprintf(stderr, "Scale file  : %s\n", quant_tool.scale_file.empty()?"NULL":quant_tool.scale_file.c_str());
-    fprintf(stderr, "Algorithm   : %s\n", quant_tool.algorithm_type?"KL":"MIN MAX");
+    fprintf(stderr, "Scale file  : %s\n", quant_tool.scale_file.empty() ? "NULL" : quant_tool.scale_file.c_str());
+    fprintf(stderr, "Algorithm   : %s\n", quant_tool.algorithm_type ? "KL" : "MIN MAX");
     fprintf(stderr, "Dims        : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w);
     fprintf(stderr, "Mean        : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]);
     fprintf(stderr, "Scale       : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]);
-    fprintf(stderr, "BGR2RGB     : %s\n", quant_tool.sw_RGB?"ON":"OFF");
-    fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop?"ON":"OFF");
+    fprintf(stderr, "BGR2RGB     : %s\n", quant_tool.sw_RGB ? "ON" : "OFF");
+    fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop ? "ON" : "OFF");
     fprintf(stderr, "Letter box  : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols);
-    fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus?"ON":"OFF");
+    fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? "ON" : "OFF");
     fprintf(stderr, "Thread num  : %d\n\n", quant_tool.num_thread);
 
     /* using 3rd calibration table file */
@@ -531,21 +529,21 @@ int main(int argc, char* argv[])
     {
         /* quantize activation */
         quant_tool.activation_quant_tool();
-        
+
         /* select algorithm */
         if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX)
             quant_tool.scale_file = "table_minmax.scale";
-        else if  (quant_tool.algorithm_type == ALGORITHM_KL)
-            quant_tool.scale_file = "table_kl.scale";            
+        else if (quant_tool.algorithm_type == ALGORITHM_KL)
+            quant_tool.scale_file = "table_kl.scale";
         else
         {
-            fprintf(stderr,"[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n");
+            fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n");
             quant_tool.scale_file = "table_minmax.scale";
         }
     }
 
     /* quantize weight/bias and save into uint8 tmfile */
-    fprintf(stderr,"[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str());
+    fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str());
     save_graph_u8_perlayer(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false);
 
     fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your UInt8 inference has a low accuracy loss...\\(^0^)/ ----\n");
diff --git a/tools/quantize/quant_tool_uint8_perchannel.cpp b/tools/quantize/quant_tool_uint8_perchannel.cpp
index 14ece0fb9..a944cad9c 100644
--- a/tools/quantize/quant_tool_uint8_perchannel.cpp
+++ b/tools/quantize/quant_tool_uint8_perchannel.cpp
@@ -1,555 +1,552 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: hhchen@openailab.com
- */
-
-
-#include <algorithm>
-#include <cfloat>
-
-#include "quant_tool.hpp"
-#include "quant_save_graph.hpp"
-
-
-QuantTool::QuantTool()
-{
-    // initial tengine
-    if (init_tengine() != 0)
-    {
-        fprintf(stderr, "Initial tengine failed.\n");
-    }
-
-    // system variable
-    this->opt.num_thread = 4;
-    this->opt.cluster = TENGINE_CLUSTER_ALL;
-    this->opt.precision = TENGINE_MODE_FP32;
-    this->opt.affinity = 0;
-    this->num_thread = 4;
-
-    // input variable
-    this->sw_RGB = 1;
-    this->img_c = 3;
-    this->img_h = 224;
-    this->img_w = 224;
-    this->mean[0] = 104.f;
-    this->mean[1] = 117.f;
-    this->mean[2] = 123.f;
-    this->scale[0] = 1.f;
-    this->scale[1] = 1.f;
-    this->scale[2] = 1.f;
-    this->center_crop = 0;
-    this->letterbox_rows = 0;
-    this->letterbox_cols = 0;
-    this->focus = 0;
-    this->inplace = true;
-    this->algorithm_type = ALGORITHM_MIN_MAX;
-}
-
-QuantTool::~QuantTool()
-{
-    /* release tengine */
-    release_tengine();
-}
-
-int QuantTool::activation_quant_tool()
-{
-    fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile.\n");
-
-    /* create graph, load tengine model xxx.tmfile */
-    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file.c_str());
-    if (nullptr == ir_graph)
-    {
-        fprintf(stderr, "Create graph failed.\n");
-        return -1;
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile done.\n");
-
-    /* set the shape, data buffer of input_tensor of the graph */
-    int img_size = img_h * img_w * img_c;
-    int dims[] = {1, img_c, img_h, img_w};    // nchw
-    std::vector<float> input_data(img_size);
-
-    tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0);
-    if (input_tensor == nullptr)
-    {
-        fprintf(stderr, "Get input tensor failed\n");
-        return -1;
-    }
-
-    if (set_tensor_shape(input_tensor, dims, 4) < 0)
-    {
-        fprintf(stderr, "Set input tensor shape failed\n");
-        return -1;
-    }
-
-    if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0)
-    {
-        fprintf(stderr, "Set input tensor buffer failed\n");
-        return -1;
-    }
-
-    /* initial malloc the output tesnors date buffer of nodes in the graph, to disable the mem pool, before prerun */
-    for (int i = 0; i < ir_graph->tensor_num; i++)
-    {
-        struct tensor* var_tensor = ir_graph->tensor_list[i];
-        if (var_tensor->tensor_type == TENSOR_TYPE_VAR)
-        {
-            var_tensor->data = ( float* )malloc(sizeof(float));
-        }
-    }
-
-    /* prerun graph, set work options(num_thread, cluster, precision) */
-    if (prerun_graph_multithread(ir_graph, this->opt) < 0)
-    {
-        fprintf(stderr, "Prerun multithread graph failed.\n");
-        return -1;
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files.\n");
-
-    /* really malloc the output tesnors date buffer of nodes in the graph */
-    for (int i = 0; i < ir_graph->tensor_num; i++)
-    {
-        struct tensor* var_tensor = ir_graph->tensor_list[i];
-        if (var_tensor->tensor_type == TENSOR_TYPE_VAR)
-        {
-            var_tensor->data = realloc(var_tensor->data, sizeof(float) * var_tensor->elem_num);
-            memset(var_tensor->data, 0, sizeof(float) * var_tensor->elem_num);
-        }
-    }
-
-    /* read image list */
-    std::vector<std::string> imgs_list;
-    readFileList(image_dir, imgs_list);
-    uint32_t img_num = imgs_list.size();
-
-    fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files done, image num is %d.\n", img_num);
-
-    /* init minmax */
-    std::unordered_map<int, float> max_activation;
-    std::unordered_map<int, float> min_activation;
-    uint32_t act_tensor_num = 0;
-    for (int i = 0; i < ir_graph->tensor_num; i++)
-    {
-        struct tensor* act_tensor = ir_graph->tensor_list[i];
-        if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT)
-        {
-            act_tensor_num++;
-            max_activation[i] = -FLT_MAX;
-            min_activation[i] = FLT_MAX;
-        }
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Step 1, find original calibration table.\n");
-
-    /* first loop, find the min/max value of every activation tensor of the graph */
-    double min_time = DBL_MAX;
-    double max_time = DBL_MIN;
-    double total_time = 0.;
-    for (int nums = 0; nums < img_num; nums++)
-    {
-        fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums+1, img_num);
-        get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus);
-
-        /* run graph */
-        double start = get_current_time();
-        if (run_graph(ir_graph, 1) < 0)
-        {
-            fprintf(stderr, "Run graph failed\n");
-            return -1;
-        }
-
-        double end = get_current_time();
-        double cur = end - start;
-        total_time += cur;
-        min_time = std::min(min_time, cur);
-        max_time = std::max(max_time, cur);
-
-        /* get the min/max value of activation tensor */
-        for (int i = 0; i < ir_graph->tensor_num; i++)
-        {
-            struct tensor* act_tensor = ir_graph->tensor_list[i];
-            if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT)
-            {
-                float* start_addr = ( float* )act_tensor->data;
-                float* end_addr   = ( float* )act_tensor->data + act_tensor->elem_num;
-                max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr));
-                min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr));
-            }
-        }
-    }
-
-    /* save the calibration file with min-max algorithm */
-    FILE* fp_minmax = fopen("table_minmax.scale", "wb");
-    for (int i = 0; i < ir_graph->tensor_num; i++)
-    {
-        struct tensor* t = ir_graph->tensor_list[i];
-        if (t->tensor_type == TENSOR_TYPE_VAR || t->tensor_type == TENSOR_TYPE_INPUT)
-        {
-            float act_scale;
-            int act_zero_point;
-            if (max_activation[i] < 0)
-            {
-                act_scale = (0 - min_activation[i]) / 255;
-                act_zero_point = int(-min_activation[i] / act_scale);
-            }
-            else if (min_activation[i] > 0)
-            {
-                act_scale = (max_activation[i] - 0) / 255;
-                act_zero_point = 0;
-            }
-            else
-            {
-                act_scale = (max_activation[i] - min_activation[i]) / 255;
-                act_zero_point = int(-min_activation[i] / act_scale);
-            }
-
-            if (act_scale == 0)
-                act_zero_point = 0;
-
-            /* the scale of softmax always is scale = 1 / 127.f */
-            for (int j = 0; j < ir_graph->node_num; j++)
-            {
-                struct node* noden = ir_graph->node_list[j];
-                struct tensor* tensor_tmp = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
-
-                if (!(tensor_tmp->tensor_type == TENSOR_TYPE_INPUT || tensor_tmp->tensor_type == TENSOR_TYPE_VAR))
-                    continue;
-
-                std::string tmp_op_name = get_op_name_from_type(noden->op.type);
-                std::string cur_name = t->name;
-                std::string tmp_name = tensor_tmp->name;
-
-                if ((cur_name == tmp_name) && tmp_op_name == "Softmax")
-                {
-                    act_scale = 1 / 255.f;
-                    act_zero_point = 0;
-                    break;
-                }
-            }
-
-            fprintf(fp_minmax, "%s %f %d\n", ir_graph->tensor_list[i]->name, act_scale, act_zero_point);
-        }
-    }
-    fclose(fp_minmax);
-    fprintf(stderr, "\r\n[Quant Tools Info]: Step 1, find original calibration table done, output ./table_minmax.scale\n");
-
-    if (this->algorithm_type == ALGORITHM_KL)
-    {
-        /* kl process divergence */
-        fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n");
-        std::tr1::unordered_map<uint32_t, uint32_t> tensor_hist;
-        std::tr1::unordered_map<uint32_t, uint32_t> hist_tensor;
-        std::vector<std::vector<float>> hist_edge;
-        std::vector<std::vector<uint32_t>> hist_gram;
-
-        /* second loop, create histgram */
-        for (int nums = imgs_list.size()-1; nums >= 0; nums--)
-        {
-            fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums+1, img_num);
-
-            get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus);
-
-            /* run graph */
-            if (run_graph(ir_graph, 1) < 0)
-            {
-                fprintf(stderr, "Run graph failed\n");
-                return -1;
-            }
-
-            /* calculate hist */
-            uint32_t inum = 0;
-            for (int i = 0; i < ir_graph->tensor_num; i++)
-            {
-                struct tensor* ir_tensor = ir_graph->tensor_list[i];
-                if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
-                {
-                    float step_max = max_activation[i] - min_activation[i];
-                    float step_bin = step_max / 2048.0f;
-
-                    std::vector<float> every_edge;
-                    if (nums == imgs_list.size() - 1)
-                    {
-                        for (int j = 0; j < 2048; j++)
-                        {
-                            float edge_float = (step_bin * (j + 0.5f)) + min_activation[i];
-                            every_edge.push_back(edge_float);
-                        }
-                        hist_edge.push_back(every_edge);
-                        hist_gram.push_back(histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]));
-                    }
-                    else
-                    {
-                        std::vector<uint32_t> hist_tmp;
-                        hist_tmp = histCount(( float* )ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]);
-                        for (int j = 0; j < 2048; j++)
-                        {
-                            hist_gram[inum][j] += hist_tmp[j];
-                        }
-                    }
-
-                    tensor_hist[i] = inum;
-                    hist_tensor[inum] = i;
-                    inum++;
-                }
-            }
-        }
-
-        fprintf(stderr, "\n");
-
-        /* save the calibration file with min-max algorithm with kl divergence */
-        FILE* fp_kl = fopen("table_kl.scale", "wb");
-        for (int i = 0; i < act_tensor_num; i++)
-        {
-            int threshold_bin = threshold_distribution(hist_gram[i], 256);
-            //        fprintf(stderr, " threshold_bin %d \n", threshold_bin);
-
-            std::vector<uint32_t> hist_gram_F(threshold_bin + 1);
-            for (int j = 0; j < threshold_bin+1; j++)
-            {
-                hist_gram_F[j] = hist_gram[i][threshold_bin - j];
-            }
-            int threshold_bin_F = threshold_distribution(hist_gram_F, 256);
-            int threshold_bin_min = threshold_bin - threshold_bin_F + 1;
-
-            // fprintf(stderr, "### %s : %d   %f   %f & %f   %f\n",ir_graph->tensor_list[hist_tensor[i]]->name, threshold_bin, min_activation[hist_tensor[i]],\
-            //                                        hist_edge[i][threshold_bin_min], hist_edge[i][threshold_bin],  max_activation[hist_tensor[i]]);
-
-            float kl_min = hist_edge[i][threshold_bin_min];
-            float kl_max = hist_edge[i][threshold_bin];
-
-            float act_scale = 1.0f;
-            int act_zero_point = 0;
-            if (kl_max < 0)
-            {
-                act_scale = (0 - kl_min) / 255.f;
-                act_zero_point = int(-kl_min / act_scale);
-            }
-            else if (kl_min > 0)
-            {
-                act_scale = (kl_max - 0) / 255.f;
-                act_zero_point = 0;
-            }
-            else
-            {
-                act_scale = (kl_max - kl_min) / 255.f;
-                act_zero_point = int(-kl_min / act_scale);
-            }
-
-            if (act_scale == 0)
-                act_zero_point = 0;
-
-            /* the scale of softmax always is scale = 1 / 255.f */
-            for (int j = 0; j < ir_graph->node_num; j++)
-            {
-                struct node* ir_node = ir_graph->node_list[j];
-                struct tensor* ir_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-
-                if (!(ir_tensor->tensor_type == TENSOR_TYPE_INPUT || ir_tensor->tensor_type == TENSOR_TYPE_VAR))
-                    continue;
-
-                std::string tmp_op_name = get_op_name_from_type(ir_node->op.type);
-                std::string cur_name = ir_graph->tensor_list[hist_tensor[i]]->name;
-                std::string tmp_name = ir_tensor->name;
-
-                if ((cur_name == tmp_name) && tmp_op_name == "Softmax")
-                {
-                    act_scale = 1 / 255.f;
-                    act_zero_point = 0;
-                    break;
-                }
-            }
-
-            fprintf(fp_kl, "%s %f %d\n", ir_graph->tensor_list[hist_tensor[i]]->name, act_scale, act_zero_point);
-        }
-        fclose(fp_kl);
-        fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n");
-    }
-
-    fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num);
-
-    /* release tengine */
-    postrun_graph(ir_graph);
-    destroy_graph(ir_graph);
-
-    return 0;
-}
-
-const char* help_params = "[Quant Tools Info]: optional arguments:\n"
-                          "\t-h    help            show this help message and exit\n"
-                          "\t-m    input model     path to input float32 tmfile\n"
-                          "\t-i    image dir       path to calibration images folder\n"
-                          "\t-f    scale file      path to calibration scale file\n"
-                          "\t-o    output model    path to output uint8 tmfile\n"
-                          "\t-a    algorithm       the type of quant algorithm(0:min-max, 1:kl, default is 0)\n"
-                          "\t-g    size            the size of input image(using the resize the original image,default is 3,224,224)\n"
-                          "\t-w    mean            value of mean (mean value, default is 104.0,117.0,123.0)\n"
-                          "\t-s    scale           value of normalize (scale value, default is 1.0,1.0,1.0)\n"
-                          "\t-b    swapRB          flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1)\n"
-                          "\t-c    center crop     flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0)\n"
-                          "\t-y    letter box      the size of letter box process image is necessary([rows, cols], default is [0, 0])\n"
-                          "\t-k    focus           flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0)\n"
-                          "\t-t    num thread      count of processing threads(default is 1)\n";
-
-const char* example_params = "[Quant Tools Info]: example arguments:\n"
-                             "\t./quant_tool_uint8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_uint8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017\n";
-
-void show_usage()
-{
-    fprintf(stderr, "%s\n", help_params);
-    fprintf(stderr, "%s\n", example_params);
-}
-
-int main(int argc, char* argv[])
-{
-    QuantTool quant_tool;
-
-    int res;
-    while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:t:h")) != -1)
-    {
-        switch (res)
-        {
-            case 'm':
-                quant_tool.model_file = optarg;
-                break;
-            case 'a':
-                quant_tool.algorithm_type = atoi(optarg);
-                break;
-            case 'f':
-                quant_tool.scale_file = optarg;
-                break;
-            case 'o':
-                quant_tool.output_file = optarg;
-                break;
-            case 'i':
-                quant_tool.image_dir = optarg;
-                break;
-            case 'g':
-                float img_chw[3];
-                split(img_chw, optarg, ",");
-                quant_tool.img_c = (int)img_chw[0];
-                quant_tool.img_h = (int)img_chw[1];
-                quant_tool.img_w = (int)img_chw[2];
-                break;
-            case 'w':
-                split(quant_tool.mean, optarg, ",");
-                break;
-            case 's':
-                split(quant_tool.scale, optarg, ",");
-                break;
-            case 'b':
-                quant_tool.sw_RGB = atoi(optarg);
-                break;
-            case 'c':
-                quant_tool.center_crop = atoi(optarg);
-                break;
-            case 'y':
-                float letterboxs[2];
-                split(letterboxs, optarg, ",");
-                quant_tool.letterbox_rows = (int)letterboxs[0];
-                quant_tool.letterbox_cols = (int)letterboxs[1];
-                break;
-            case 'k':
-                quant_tool.focus = atoi(optarg);
-                break;
-            case 't':
-                quant_tool.num_thread = atoi(optarg);
-                quant_tool.opt.num_thread = atoi(optarg);
-                break;
-            case 'h':
-                show_usage();
-                return 0;
-            default:
-                break;
-        }
-    }
-
-    /* version */
-    fprintf(stderr, "\n---- Tengine Post Training Quantization Tool ---- \n");
-    fprintf(stderr, "\nVersion     : v1.2, %s %s\n", __TIME__, __DATE__);
-    fprintf(stderr, "Status      : uint8, per-channel, asymmetric\n");
-
-    /* check input params */
-    if (quant_tool.model_file.empty())
-    {
-        fprintf(stderr,"[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n");
-        show_usage();
-        return -1;
-    }
-
-    if (quant_tool.image_dir.empty())
-    {
-        fprintf(stderr,"[Quant Tools Info]: The input dir of Calibration image not specified!\n");
-        show_usage();
-        return -1;
-    }
-
-    if (quant_tool.output_file.empty())
-    {
-        fprintf(stderr,"[Quant Tools Info]: The output file of Int8 tmfile not specified!\n");
-        show_usage();
-        return -1;
-    }
-
-    /* debug info : input params */
-    fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str());
-    fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str());
-    fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str());
-    fprintf(stderr, "Scale file  : %s\n", quant_tool.scale_file.empty()?"NULL":quant_tool.scale_file.c_str());
-    fprintf(stderr, "Algorithm   : %s\n", quant_tool.algorithm_type?"KL":"MIN MAX");
-    fprintf(stderr, "Dims        : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w);
-    fprintf(stderr, "Mean        : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]);
-    fprintf(stderr, "Scale       : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]);
-    fprintf(stderr, "BGR2RGB     : %s\n", quant_tool.sw_RGB?"ON":"OFF");
-    fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop?"ON":"OFF");
-    fprintf(stderr, "Letter box  : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols);
-    fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus?"ON":"OFF");
-    fprintf(stderr, "Thread num  : %d\n\n", quant_tool.num_thread);
-
-
-    /* using 3rd calibration table file */
-    if (quant_tool.scale_file.empty())
-    {
-        /* quantize activation */
-        quant_tool.activation_quant_tool();
-
-        /* select algorithm */
-        if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX)
-            quant_tool.scale_file = "table_minmax.scale";
-        else if  (quant_tool.algorithm_type == ALGORITHM_KL)
-            quant_tool.scale_file = "table_kl.scale";
-        else
-        {
-            fprintf(stderr,"[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n");
-            quant_tool.scale_file = "table_minmax.scale";
-        }
-    }
-
-    /* quantize weight/bias and save into uint8 tmfile */
-    fprintf(stderr,"[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str());
-    save_graph_u8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false);
-
-    fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n");
-
-    return 0;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: hhchen@openailab.com
+ */
+
+#include <algorithm>
+#include <cfloat>
+
+#include "quant_tool.hpp"
+#include "quant_save_graph.hpp"
+
+QuantTool::QuantTool() // Initialises the tengine runtime and resets every option to its default value.
+{
+    // initialise tengine; a failure is only reported on stderr and construction continues
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+    }
+
+    // runtime options, later handed to prerun_graph_multithread()
+    this->opt.num_thread = 4;
+    this->opt.cluster = TENGINE_CLUSTER_ALL;
+    this->opt.precision = TENGINE_MODE_FP32;
+    this->opt.affinity = 0;
+    this->num_thread = 4; // mirrors opt.num_thread; NOTE(review): the -t help text advertises a default of 1
+
+    // preprocessing defaults: 3x224x224 input, mean 104/117/123, scale 1, channel swap on
+    this->sw_RGB = 1;
+    this->img_c = 3;
+    this->img_h = 224;
+    this->img_w = 224;
+    this->mean[0] = 104.f;
+    this->mean[1] = 117.f;
+    this->mean[2] = 123.f;
+    this->scale[0] = 1.f;
+    this->scale[1] = 1.f;
+    this->scale[2] = 1.f;
+    this->center_crop = 0;
+    this->letterbox_rows = 0;
+    this->letterbox_cols = 0;
+    this->focus = 0;
+    this->inplace = true; // forwarded to save_graph_u8_perchannel() by main()
+    this->algorithm_type = ALGORITHM_MIN_MAX; // min-max calibration unless -a selects another algorithm
+}
+
+QuantTool::~QuantTool() // Counterpart of the constructor's init_tengine() call.
+{
+    /* release the tengine runtime that the constructor initialised */
+    release_tengine();
+}
+
+int QuantTool::activation_quant_tool() // Calibrates activation ranges: always writes table_minmax.scale, plus table_kl.scale for ALGORITHM_KL; returns 0, or -1 on failure.
+{
+    fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile.\n");
+
+    /* create the graph by loading the tengine model (xxx.tmfile) named by model_file */
+    struct graph* ir_graph = (struct graph*)create_graph(nullptr, "tengine", model_file.c_str());
+    if (nullptr == ir_graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 0, load FP32 tmfile done.\n");
+
+    /* set the shape and the data buffer of the graph's input tensor */
+    int img_size = img_h * img_w * img_c;
+    int dims[] = {1, img_c, img_h, img_w}; // nchw
+    std::vector<float> input_data(img_size); // refilled for every image; registered below as the input tensor's buffer
+
+    tensor_t input_tensor = get_graph_input_tensor(ir_graph, 0, 0);
+    if (input_tensor == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1; // NOTE(review): ir_graph is not destroyed on this and the later error returns
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor, input_data.data(), img_size * sizeof(float)) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }
+
+    /* before prerun, give every VAR tensor a one-float placeholder data buffer (original note: this disables the mem pool) */
+    for (int i = 0; i < ir_graph->tensor_num; i++)
+    {
+        struct tensor* var_tensor = ir_graph->tensor_list[i];
+        if (var_tensor->tensor_type == TENSOR_TYPE_VAR)
+        {
+            var_tensor->data = (float*)malloc(sizeof(float));
+        }
+    }
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(ir_graph, this->opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files.\n");
+
+    /* after prerun, grow each VAR tensor's placeholder to elem_num floats and zero it */
+    for (int i = 0; i < ir_graph->tensor_num; i++)
+    {
+        struct tensor* var_tensor = ir_graph->tensor_list[i];
+        if (var_tensor->tensor_type == TENSOR_TYPE_VAR)
+        {
+            var_tensor->data = realloc(var_tensor->data, sizeof(float) * var_tensor->elem_num);
+            memset(var_tensor->data, 0, sizeof(float) * var_tensor->elem_num);
+        }
+    }
+
+    /* read the list of calibration images found under image_dir */
+    std::vector<std::string> imgs_list;
+    readFileList(image_dir, imgs_list);
+    uint32_t img_num = imgs_list.size();
+
+    fprintf(stderr, "[Quant Tools Info]: Step 0, load calibration image files done, image num is %d.\n", img_num);
+
+    /* seed the min/max of every activation tensor (VAR or INPUT) with sentinels and count those tensors */
+    std::unordered_map<int, float> max_activation;
+    std::unordered_map<int, float> min_activation;
+    uint32_t act_tensor_num = 0;
+    for (int i = 0; i < ir_graph->tensor_num; i++)
+    {
+        struct tensor* act_tensor = ir_graph->tensor_list[i];
+        if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT)
+        {
+            act_tensor_num++;
+            max_activation[i] = -FLT_MAX;
+            min_activation[i] = FLT_MAX;
+        }
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Step 1, find original calibration table.\n");
+
+    /* first loop, find the min/max value of every activation tensor of the graph */
+    double min_time = DBL_MAX; // NOTE(review): min_time/max_time are updated below but never read in this function
+    double max_time = DBL_MIN; // NOTE(review): DBL_MIN is the smallest positive double, not the most negative value
+    double total_time = 0.;
+    for (int nums = 0; nums < img_num; nums++)
+    {
+        fprintf(stderr, "\r[Quant Tools Info]: Step 1, images %.5d / %.5d", nums + 1, img_num);
+        get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus);
+
+        /* run graph and time the call */
+        double start = get_current_time();
+        if (run_graph(ir_graph, 1) < 0)
+        {
+            fprintf(stderr, "Run graph failed\n");
+            return -1;
+        }
+
+        double end = get_current_time();
+        double cur = end - start;
+        total_time += cur;
+        min_time = std::min(min_time, cur);
+        max_time = std::max(max_time, cur);
+
+        /* fold this image's per-tensor extremes into the running min/max */
+        for (int i = 0; i < ir_graph->tensor_num; i++)
+        {
+            struct tensor* act_tensor = ir_graph->tensor_list[i];
+            if (act_tensor->tensor_type == TENSOR_TYPE_VAR || act_tensor->tensor_type == TENSOR_TYPE_INPUT)
+            {
+                float* start_addr = (float*)act_tensor->data;
+                float* end_addr = (float*)act_tensor->data + act_tensor->elem_num;
+                max_activation[i] = std::max(max_activation[i], *std::max_element(start_addr, end_addr));
+                min_activation[i] = std::min(min_activation[i], *std::min_element(start_addr, end_addr));
+            }
+        }
+    }
+
+    /* save the calibration file with min-max algorithm: one "name scale zero_point" line per activation tensor */
+    FILE* fp_minmax = fopen("table_minmax.scale", "wb"); // NOTE(review): the fopen() result is used unchecked
+    for (int i = 0; i < ir_graph->tensor_num; i++)
+    {
+        struct tensor* t = ir_graph->tensor_list[i];
+        if (t->tensor_type == TENSOR_TYPE_VAR || t->tensor_type == TENSOR_TYPE_INPUT)
+        {
+            float act_scale;
+            int act_zero_point;
+            if (max_activation[i] < 0) // all values negative: the range is stretched to [min, 0]
+            {
+                act_scale = (0 - min_activation[i]) / 255;
+                act_zero_point = int(-min_activation[i] / act_scale);
+            }
+            else if (min_activation[i] > 0) // all values positive: the range is stretched to [0, max]
+            {
+                act_scale = (max_activation[i] - 0) / 255;
+                act_zero_point = 0;
+            }
+            else // the range straddles zero: use [min, max]
+            {
+                act_scale = (max_activation[i] - min_activation[i]) / 255;
+                act_zero_point = int(-min_activation[i] / act_scale);
+            }
+
+            if (act_scale == 0) // overwrite the zero point computed by the division above when the scale is zero
+                act_zero_point = 0;
+
+            /* the scale of softmax always is scale = 1 / 255.f */
+            for (int j = 0; j < ir_graph->node_num; j++)
+            {
+                struct node* noden = ir_graph->node_list[j];
+                struct tensor* tensor_tmp = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]);
+
+                if (!(tensor_tmp->tensor_type == TENSOR_TYPE_INPUT || tensor_tmp->tensor_type == TENSOR_TYPE_VAR))
+                    continue;
+
+                std::string tmp_op_name = get_op_name_from_type(noden->op.type);
+                std::string cur_name = t->name;
+                std::string tmp_name = tensor_tmp->name;
+
+                if ((cur_name == tmp_name) && tmp_op_name == "Softmax") // tensor t is the first output of a Softmax node
+                {
+                    act_scale = 1 / 255.f;
+                    act_zero_point = 0;
+                    break;
+                }
+            }
+
+            fprintf(fp_minmax, "%s %f %d\n", ir_graph->tensor_list[i]->name, act_scale, act_zero_point);
+        }
+    }
+    fclose(fp_minmax);
+    fprintf(stderr, "\r\n[Quant Tools Info]: Step 1, find original calibration table done, output ./table_minmax.scale\n");
+
+    if (this->algorithm_type == ALGORITHM_KL)
+    {
+        /* KL divergence pass */
+        fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n");
+        std::tr1::unordered_map<uint32_t, uint32_t> tensor_hist; // tensor index -> histogram slot; NOTE(review): std::tr1 here, std::unordered_map above
+        std::tr1::unordered_map<uint32_t, uint32_t> hist_tensor; // histogram slot -> tensor index
+        std::vector<std::vector<float> > hist_edge; // per slot: the 2048 bin centres
+        std::vector<std::vector<uint32_t> > hist_gram; // per slot: the 2048 bin counts summed over all images
+
+        /* second loop (last image first), build the histograms */
+        for (int nums = imgs_list.size() - 1; nums >= 0; nums--)
+        {
+            fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums + 1, img_num);
+
+            get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus);
+
+            /* run graph */
+            if (run_graph(ir_graph, 1) < 0)
+            {
+                fprintf(stderr, "Run graph failed\n");
+                return -1;
+            }
+
+            /* accumulate this image's histogram for every activation tensor; inum is the histogram slot */
+            uint32_t inum = 0;
+            for (int i = 0; i < ir_graph->tensor_num; i++)
+            {
+                struct tensor* ir_tensor = ir_graph->tensor_list[i];
+                if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT)
+                {
+                    float step_max = max_activation[i] - min_activation[i];
+                    float step_bin = step_max / 2048.0f;
+
+                    std::vector<float> every_edge;
+                    if (nums == imgs_list.size() - 1) // first iteration of the image loop: create the edge and histogram entries
+                    {
+                        for (int j = 0; j < 2048; j++)
+                        {
+                            float edge_float = (step_bin * (j + 0.5f)) + min_activation[i]; // centre of bin j
+                            every_edge.push_back(edge_float);
+                        }
+                        hist_edge.push_back(every_edge);
+                        hist_gram.push_back(histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]));
+                    }
+                    else // later iterations: add this image's counts to the existing histogram
+                    {
+                        std::vector<uint32_t> hist_tmp;
+                        hist_tmp = histCount((float*)ir_tensor->data, ir_tensor->elem_num, max_activation[i], min_activation[i]);
+                        for (int j = 0; j < 2048; j++)
+                        {
+                            hist_gram[inum][j] += hist_tmp[j];
+                        }
+                    }
+
+                    tensor_hist[i] = inum;
+                    hist_tensor[inum] = i;
+                    inum++;
+                }
+            }
+        }
+
+        fprintf(stderr, "\n");
+
+        /* save the calibration file computed with kl divergence */
+        FILE* fp_kl = fopen("table_kl.scale", "wb"); // NOTE(review): the fopen() result is used unchecked
+        for (int i = 0; i < act_tensor_num; i++)
+        {
+            int threshold_bin = threshold_distribution(hist_gram[i], 256); // bin used for the upper bound kl_max
+            //        fprintf(stderr, " threshold_bin %d \n", threshold_bin);
+
+            std::vector<uint32_t> hist_gram_F(threshold_bin + 1); // bins [0, threshold_bin] copied in reverse order
+            for (int j = 0; j < threshold_bin + 1; j++)
+            {
+                hist_gram_F[j] = hist_gram[i][threshold_bin - j];
+            }
+            int threshold_bin_F = threshold_distribution(hist_gram_F, 256);
+            int threshold_bin_min = threshold_bin - threshold_bin_F + 1; // bin used for the lower bound kl_min
+
+            // fprintf(stderr, "### %s : %d   %f   %f & %f   %f\n",ir_graph->tensor_list[hist_tensor[i]]->name, threshold_bin, min_activation[hist_tensor[i]],\
+            //                                        hist_edge[i][threshold_bin_min], hist_edge[i][threshold_bin],  max_activation[hist_tensor[i]]);
+
+            float kl_min = hist_edge[i][threshold_bin_min];
+            float kl_max = hist_edge[i][threshold_bin];
+
+            float act_scale = 1.0f;
+            int act_zero_point = 0;
+            if (kl_max < 0) // same three range cases as the min-max table, applied to [kl_min, kl_max]
+            {
+                act_scale = (0 - kl_min) / 255.f;
+                act_zero_point = int(-kl_min / act_scale);
+            }
+            else if (kl_min > 0)
+            {
+                act_scale = (kl_max - 0) / 255.f;
+                act_zero_point = 0;
+            }
+            else
+            {
+                act_scale = (kl_max - kl_min) / 255.f;
+                act_zero_point = int(-kl_min / act_scale);
+            }
+
+            if (act_scale == 0)
+                act_zero_point = 0;
+
+            /* the scale of softmax always is scale = 1 / 255.f */
+            for (int j = 0; j < ir_graph->node_num; j++)
+            {
+                struct node* ir_node = ir_graph->node_list[j];
+                struct tensor* ir_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
+
+                if (!(ir_tensor->tensor_type == TENSOR_TYPE_INPUT || ir_tensor->tensor_type == TENSOR_TYPE_VAR))
+                    continue;
+
+                std::string tmp_op_name = get_op_name_from_type(ir_node->op.type);
+                std::string cur_name = ir_graph->tensor_list[hist_tensor[i]]->name;
+                std::string tmp_name = ir_tensor->name;
+
+                if ((cur_name == tmp_name) && tmp_op_name == "Softmax")
+                {
+                    act_scale = 1 / 255.f;
+                    act_zero_point = 0;
+                    break;
+                }
+            }
+
+            fprintf(fp_kl, "%s %f %d\n", ir_graph->tensor_list[hist_tensor[i]]->name, act_scale, act_zero_point);
+        }
+        fclose(fp_kl);
+        fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n");
+    }
+
+    fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); // NOTE(review): divides by img_num, which is 0 when the image list is empty
+
+    /* release the graph */
+    postrun_graph(ir_graph);
+    destroy_graph(ir_graph);
+
+    return 0;
+}
+
+const char* help_params = "[Quant Tools Info]: optional arguments:\n" // option summary printed by show_usage(); one line per getopt flag handled in main()
+                          "\t-h    help            show this help message and exit\n"
+                          "\t-m    input model     path to input float32 tmfile\n"
+                          "\t-i    image dir       path to calibration images folder\n"
+                          "\t-f    scale file      path to calibration scale file\n"
+                          "\t-o    output model    path to output uint8 tmfile\n"
+                          "\t-a    algorithm       the type of quant algorithm(0:min-max, 1:kl, default is 0)\n"
+                          "\t-g    size            the size of input image(using the resize the original image,default is 3,224,224)\n"
+                          "\t-w    mean            value of mean (mean value, default is 104.0,117.0,123.0)\n"
+                          "\t-s    scale           value of normalize (scale value, default is 1.0,1.0,1.0)\n"
+                          "\t-b    swapRB          flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1)\n"
+                          "\t-c    center crop     flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0)\n"
+                          "\t-y    letter box      the size of letter box process image is necessary([rows, cols], default is [0, 0])\n"
+                          "\t-k    focus           flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0)\n"
+                          "\t-t    num thread      count of processing threads(default is 1)\n"; // NOTE(review): QuantTool's constructor sets num_thread to 4, not 1
+
+const char* example_params = "[Quant Tools Info]: example arguments:\n" // sample command line printed by show_usage() after the option summary
+                             "\t./quant_tool_uint8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_uint8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017\n";
+
+void show_usage() // Prints the option summary and the example command line to stderr.
+{
+    fprintf(stderr, "%s\n", help_params);
+    fprintf(stderr, "%s\n", example_params);
+}
+
+int main(int argc, char* argv[]) // Parses the options, runs activation calibration unless -f supplies a table, then saves the quantized uint8 tmfile.
+{
+    QuantTool quant_tool;
+
+    int res;
+    while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:t:h")) != -1)
+    {
+        switch (res)
+        {
+        case 'm':
+            quant_tool.model_file = optarg;
+            break;
+        case 'a':
+            quant_tool.algorithm_type = atoi(optarg);
+            break;
+        case 'f':
+            quant_tool.scale_file = optarg;
+            break;
+        case 'o':
+            quant_tool.output_file = optarg;
+            break;
+        case 'i':
+            quant_tool.image_dir = optarg;
+            break;
+        case 'g': // "c,h,w": parsed as floats by split(), then truncated to int
+            float img_chw[3];
+            split(img_chw, optarg, ",");
+            quant_tool.img_c = (int)img_chw[0];
+            quant_tool.img_h = (int)img_chw[1];
+            quant_tool.img_w = (int)img_chw[2];
+            break;
+        case 'w':
+            split(quant_tool.mean, optarg, ",");
+            break;
+        case 's':
+            split(quant_tool.scale, optarg, ",");
+            break;
+        case 'b':
+            quant_tool.sw_RGB = atoi(optarg);
+            break;
+        case 'c':
+            quant_tool.center_crop = atoi(optarg);
+            break;
+        case 'y': // "rows,cols": parsed as floats by split(), then truncated to int
+            float letterboxs[2];
+            split(letterboxs, optarg, ",");
+            quant_tool.letterbox_rows = (int)letterboxs[0];
+            quant_tool.letterbox_cols = (int)letterboxs[1];
+            break;
+        case 'k':
+            quant_tool.focus = atoi(optarg);
+            break;
+        case 't': // keeps the reporting copy and the runtime option in sync
+            quant_tool.num_thread = atoi(optarg);
+            quant_tool.opt.num_thread = atoi(optarg);
+            break;
+        case 'h':
+            show_usage();
+            return 0;
+        default: // any other getopt result is ignored here
+            break;
+        }
+    }
+
+    /* banner: tool name, version and build time */
+    fprintf(stderr, "\n---- Tengine Post Training Quantization Tool ---- \n");
+    fprintf(stderr, "\nVersion     : v1.2, %s %s\n", __TIME__, __DATE__);
+    fprintf(stderr, "Status      : uint8, per-channel, asymmetric\n");
+
+    /* check input params: -m, -i and -o are all mandatory */
+    if (quant_tool.model_file.empty())
+    {
+        fprintf(stderr, "[Quant Tools Info]: The input file of Float32 tmfile file not specified!\n");
+        show_usage();
+        return -1;
+    }
+
+    if (quant_tool.image_dir.empty()) // NOTE(review): required even when -f supplies a ready-made scale file
+    {
+        fprintf(stderr, "[Quant Tools Info]: The input dir of Calibration image not specified!\n");
+        show_usage();
+        return -1;
+    }
+
+    if (quant_tool.output_file.empty())
+    {
+        fprintf(stderr, "[Quant Tools Info]: The output file of Int8 tmfile not specified!\n"); // NOTE(review): message says Int8, but this tool writes a uint8 tmfile
+        show_usage();
+        return -1;
+    }
+
+    /* debug info : input params */
+    fprintf(stderr, "Input model : %s\n", quant_tool.model_file.c_str());
+    fprintf(stderr, "Output model: %s\n", quant_tool.output_file.c_str());
+    fprintf(stderr, "Calib images: %s\n", quant_tool.image_dir.c_str());
+    fprintf(stderr, "Scale file  : %s\n", quant_tool.scale_file.empty() ? "NULL" : quant_tool.scale_file.c_str());
+    fprintf(stderr, "Algorithm   : %s\n", quant_tool.algorithm_type ? "KL" : "MIN MAX"); // any non-zero -a value is printed as KL
+    fprintf(stderr, "Dims        : %d %d %d\n", quant_tool.img_c, quant_tool.img_h, quant_tool.img_w);
+    fprintf(stderr, "Mean        : %.3f %.3f %.3f\n", quant_tool.mean[0], quant_tool.mean[1], quant_tool.mean[2]);
+    fprintf(stderr, "Scale       : %.3f %.3f %.3f\n", quant_tool.scale[0], quant_tool.scale[1], quant_tool.scale[2]);
+    fprintf(stderr, "BGR2RGB     : %s\n", quant_tool.sw_RGB ? "ON" : "OFF");
+    fprintf(stderr, "Center crop : %s\n", quant_tool.center_crop ? "ON" : "OFF");
+    fprintf(stderr, "Letter box  : %d %d\n", quant_tool.letterbox_rows, quant_tool.letterbox_cols);
+    fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? "ON" : "OFF");
+    fprintf(stderr, "Thread num  : %d\n\n", quant_tool.num_thread);
+
+    /* no external calibration table was given with -f: run the calibration and use the table it writes */
+    if (quant_tool.scale_file.empty())
+    {
+        /* quantize activation */
+        quant_tool.activation_quant_tool(); // NOTE(review): the return value (-1 on failure) is ignored
+
+        /* select the table that matches the chosen algorithm */
+        if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX)
+            quant_tool.scale_file = "table_minmax.scale";
+        else if (quant_tool.algorithm_type == ALGORITHM_KL)
+            quant_tool.scale_file = "table_kl.scale";
+        else
+        {
+            fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n");
+            quant_tool.scale_file = "table_minmax.scale";
+        }
+    }
+
+    /* quantize weight/bias and save into uint8 tmfile */
+    fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str());
+    save_graph_u8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false);
+
+    fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); // NOTE(review): says Int8/INT8, although the output is a uint8 tmfile
+
+    return 0;
+}
diff --git a/tools/quantize/quant_utils.cpp b/tools/quantize/quant_utils.cpp
index ff4c72662..7cf67daf9 100644
--- a/tools/quantize/quant_utils.cpp
+++ b/tools/quantize/quant_utils.cpp
@@ -1,550 +1,543 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: hhchen@openailab.com
- */
-
-
-#include <dirent.h>
-#include <string.h>
-
-#include <opencv2/core/core.hpp>
-#include <opencv2/highgui/highgui.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
-
-#ifdef _MSC_VER
-#include "getopt.h"
-#else
-#include <unistd.h>
-#endif
-
-#ifdef _WIN32
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-#else    // _WIN32
-#include <sys/time.h>
-#endif    // _WIN32
-
-#include "quant_utils.hpp"
-
-
-#ifdef _WIN32
-double get_current_time()
-{
-    LARGE_INTEGER freq;
-    LARGE_INTEGER pc;
-    QueryPerformanceFrequency(&freq);
-    QueryPerformanceCounter(&pc);
-
-    return pc.QuadPart * 1000.0 / freq.QuadPart;
-}
-#else    // _WIN32
-
-double get_current_time()
-{
-    struct timeval tv;
-    gettimeofday(&tv, nullptr);
-
-    return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
-}
-#endif    // _WIN32
-
-void split(float* array, char* str, const char* del)
-{
-    char* s = nullptr;
-    s = strtok(str, del);
-    while (s != nullptr)
-    {
-        *array++ = atof(s);
-        s = strtok(nullptr, del);
-    }
-}
-
-void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean,
-                       const float* scale, int sw_RGB = 0, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0)
-{
-    /* only for yolov5s */
-    if (focus == 1 && letterbox_rows > 0 && letterbox_cols > 0)
-    {
-        cv::Mat sample = cv::imread(image_file, 1);
-        cv::Mat img;
-
-        if (sample.channels() == 4)
-        {
-            cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR);
-        }
-        else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0)
-        {
-            cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR);
-        }
-        else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1)
-        {
-            cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB);
-        }
-        else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1)
-        {
-            cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
-        }
-        else if (sample.channels() == 3 && img_c == 1)
-        {
-            cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY);
-        }
-        else
-        {
-            img = sample;
-        }
-
-        /* letterbox process to support different letterbox size */
-        float scale_letterbox;
-        int resize_rows;
-        int resize_cols;
-        if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
-        {
-            scale_letterbox = letterbox_rows * 1.0 / img.rows;
-        }
-        else
-        {
-            scale_letterbox = letterbox_cols * 1.0 / img.cols;
-        }
-        resize_cols = int(scale_letterbox * img.cols);
-        resize_rows = int(scale_letterbox * img.rows);
-
-        cv::resize(img, img, cv::Size(resize_cols, resize_rows));
-        img.convertTo(img, CV_32FC3);
-
-        // Generate a gray image for letterbox using opencv
-        cv::Mat resize_img(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2]));
-        int top = (letterbox_rows - resize_rows) / 2;
-        int bot = (letterbox_rows - resize_rows + 1) / 2;
-        int left = (letterbox_cols - resize_cols) / 2;
-        int right = (letterbox_cols - resize_cols + 1) / 2;
-
-        // Letterbox filling
-        cv::copyMakeBorder(img, resize_img, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2]));
-
-        resize_img.convertTo(resize_img, CV_32FC3);
-        float* img_data   = (float* )resize_img.data;
-        float* input_temp = (float* )malloc(3 * letterbox_rows * letterbox_cols * sizeof(float));
-
-        /* nhwc to nchw */
-        for (int h = 0; h < letterbox_rows; h++)
-        {
-            for (int w = 0; w < letterbox_cols; w++)
-            {
-                for (int c = 0; c < 3; c++)
-                {
-                    int in_index  = h * letterbox_cols * 3 + w * 3 + c;
-                    int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
-                    input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
-                }
-            }
-        }
-
-        /* focus process */
-        for (int i = 0; i < 2; i++) // corresponding to rows
-        {
-            for (int g = 0; g < 2; g++) // corresponding to cols
-            {
-                for (int c = 0; c < 3; c++)
-                {
-                    for (int h = 0; h < letterbox_rows/2; h++)
-                    {
-                        for (int w = 0; w < letterbox_cols/2; w++)
-                        {
-                            int in_index  = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows +
-                                            h * 2 * letterbox_cols + w * 2;
-                            int out_index = i * 2 * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                            g * 3 * (letterbox_cols/2) * (letterbox_rows/2) +
-                                            c * (letterbox_cols/2) * (letterbox_rows/2) +
-                                            h * (letterbox_cols/2) +
-                                            w;
-
-                            input_data[out_index] = input_temp[in_index];
-                        }
-                    }
-                }
-            }
-        }
-
-        free(input_temp);
-
-        return;
-    }
-
-    cv::Mat sample = cv::imread(image_file, 1);
-    cv::Mat img;
-
-    if (sample.channels() == 4)
-    {
-        cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR);
-    }
-    else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0)
-    {
-        cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR);
-    }
-    else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1)
-    {
-        cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB);
-    }
-    else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1)
-    {
-        cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
-    }
-    else if (sample.channels() == 3 && img_c == 1)
-    {
-        cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY);
-    }
-    else
-    {
-        img = sample;
-    }
-
-    if (center_crop == 1)
-    {
-        int h0 = 0;
-        int w0 = 0;
-        if ( img.rows < img.cols)
-        {
-            h0 = 256;
-            w0 = int(img.cols*(256.0/img.rows));
-        }
-        else
-        {
-            h0 = int(img.rows*(256.0/img.cols));
-            w0 = 256;
-        }
-        int center_h = int(h0/2);
-        int center_w = int(w0/2);
-
-        float* img_data = nullptr;
-
-        cv::resize(img, img, cv::Size(w0, h0));
-        cv::Rect img_roi_box(center_w - 112, center_h - 112, 224, 224);
-        cv::Mat img_crop = img(img_roi_box).clone();
-
-        if (img_c == 3)
-            img_crop.convertTo(img_crop, CV_32FC3);
-        else if (img_c == 1)
-            img_crop.convertTo(img_crop, CV_32FC1);
-        img_data = ( float* )img_crop.data;
-
-        int hw = img_h * img_w;
-        for (int h = 0; h < img_h; h++)
-        {
-            for (int w = 0; w < img_w; w++)
-            {
-                for (int c = 0; c < img_c; c++)
-                {
-                    input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c];
-                    img_data++;
-                }
-            }
-        }
-    }
-    else if (letterbox_rows > 0)
-    {
-        float letterbox_size = (float)letterbox_rows;
-        int resize_h = 0;
-        int resize_w = 0;
-        if (img.rows > img.cols)
-        {
-            resize_h = letterbox_size;
-            resize_w = int(img.cols * (letterbox_size / img.rows));
-        }
-        else
-        {
-            resize_h = int(img.rows * (letterbox_size / img.cols));
-            resize_w = letterbox_size;
-        }
-
-        float* img_data = nullptr;
-
-        cv::resize(img, img, cv::Size(resize_w, resize_h));
-        img.convertTo(img, CV_32FC3);
-        cv::Mat img_new(letterbox_size, letterbox_size, CV_32FC3,
-                        cv::Scalar(0.5/scale[0] + mean[0], 0.5/scale[1] + mean[1], 0.5/ scale[2] + mean[2]));
-        int dh = int((letterbox_size - resize_h) / 2);
-        int dw = int((letterbox_size - resize_w) / 2);
-
-        for (int h = 0; h < resize_h; h++)
-        {
-            for (int w = 0; w < resize_w; w++)
-            {
-                for (int c = 0; c < 3; ++c)
-                {
-                    int in_index  = h * resize_w * 3 + w * 3 + c;
-                    int out_index = (dh + h) * letterbox_size * 3 + (dw + w) * 3 + c;
-
-                    (( float* )img_new.data)[out_index] = (( float* )img.data)[in_index];
-                }
-            }
-        }
-
-        if (img_c == 3)
-            img_new.convertTo(img_new, CV_32FC3);
-        else if (img_c == 1)
-            img_new.convertTo(img_new, CV_32FC1);
-        img_data = ( float* )img_new.data;
-
-        int hw = img_h * img_w;
-        for (int h = 0; h < img_h; h++)
-        {
-            for (int w = 0; w < img_w; w++)
-            {
-                for (int c = 0; c < img_c; c++)
-                {
-                    input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c];
-                    img_data++;
-                }
-            }
-        }
-    }
-    else
-    {
-        cv::resize(img, img, cv::Size(img_w, img_h));
-        if (img_c == 3)
-            img.convertTo(img, CV_32FC3);
-        else if (img_c == 1)
-            img.convertTo(img, CV_32FC1);
-        float* img_data = ( float* )img.data;
-        int hw = img_h * img_w;
-        for (int h = 0; h < img_h; h++)
-        {
-            for (int w = 0; w < img_w; w++)
-            {
-                for (int c = 0; c < img_c; c++)
-                {
-                    input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c];
-                    img_data++;
-                }
-            }
-        }
-    }
-}
-
-void readFileList(std::string basePath, std::vector<std::string>& imgs)
-{
-    DIR *dir;
-    struct dirent *ptr;
-    std::string base;
-
-    if ((dir=opendir(basePath.c_str())) == NULL)
-    {
-        perror("Open dir error...");
-        exit(1); 
-    }
-
-    while ((ptr=readdir(dir)) != NULL)
-    {
-        if(strcmp(ptr->d_name,".")==0 || strcmp(ptr->d_name,"..")==0)    ///current dir OR parrent dir
-            continue;
-        else if(ptr->d_type == 8)    ///file
-        {
-            base = basePath + "/" + ptr->d_name;
-            imgs.push_back(base);
-        }
-        else if(ptr->d_type == 4)    ///dir
-        {
-            readFileList(basePath + "/" + ptr->d_name, imgs);
-        }
-    }
-    closedir(dir);
-}
-
-std::vector<uint32_t> histCount(float *data, uint32_t elem_num, float max_val, float min_val)
-{
-    float bin_scale = (max_val - min_val) / 2047.f;
-    int bin_zp = int(-min_val / bin_scale);
-    std::vector<uint32_t> hist(2048);
-    for (int i = 0; i < elem_num; i++)
-        if (data[i] != 0)
-            hist[uint32_t(data[i] / bin_scale + bin_zp)] ++;
-    return hist;
-}
-
-float compute_kl_divergence(std::vector<float> &dist_a, std::vector<float> &dist_b)
-{
-    const size_t length = dist_a.size();
-    float result = 0;
-
-    for (size_t i = 0; i < length; i++)
-    {
-        if (dist_a[i] != 0)
-        {
-            if (dist_b[i] == 0)
-            {
-                result += 1;
-            }
-            else
-            {
-                result += dist_a[i] * log(dist_a[i] / dist_b[i]);
-            }
-        }
-    }
-
-    return result;
-}
-
-std::vector<float> normalize_histogram(std::vector<uint32_t> &histogram)
-{
-    std::vector<float> histogram_out(histogram.size());
-    const size_t length = histogram.size();
-    float sum = 0;
-
-    for (size_t i = 1; i < length; i++)
-        sum += histogram[i];
-
-    for (size_t i = 1; i < length; i++)
-        histogram_out[i] = float(histogram[i] / sum);
-
-    return histogram_out;
-}
-
-int threshold_distribution(std::vector<uint32_t> &distribution_in, const int target_bin) 
-{
-    int target_threshold = target_bin;
-    float min_kl_divergence = FLT_MAX;
-    const int length = static_cast<int>(distribution_in.size());
-
-    std::vector<float> distribution(distribution_in.size());
-    std::vector<float> quantize_distribution(target_bin);
-    distribution = normalize_histogram(distribution_in);
-
-    float threshold_sum = 0;
-    for (int threshold = target_bin; threshold < length; threshold++)
-    {
-        threshold_sum += distribution[threshold];
-    }
-
-    for (int threshold = target_bin; threshold < length; threshold++)
-    {
-        std::vector<float> t_distribution(distribution.begin(), distribution.begin() + threshold);
-
-        t_distribution[threshold - 1] += threshold_sum;
-        threshold_sum -= distribution[threshold];
-
-        // get P
-        fill(quantize_distribution.begin(), quantize_distribution.end(), 0.0f);
-
-        const float num_per_bin = static_cast<float>(threshold) / static_cast<float>(target_bin);
-
-        for (int i = 0; i < target_bin; i++)
-        {
-            const float start = static_cast<float>(i) * num_per_bin;
-            const float end = start + num_per_bin;
-
-            const int left_upper = static_cast<int>(ceil(start));
-            if (static_cast<float>(left_upper) > start)
-            {
-                const float left_scale = static_cast<float>(left_upper) - start;
-                quantize_distribution[i] += left_scale * distribution[left_upper - 1];
-            }
-
-            const int right_lower = static_cast<int>(floor(end));
-
-            if (static_cast<float>(right_lower) < end)
-            {
-                const float right_scale = end - static_cast<float>(right_lower);
-                quantize_distribution[i] += right_scale * distribution[right_lower];
-            }
-
-            for (int j = left_upper; j < right_lower; j++)
-            {
-                quantize_distribution[i] += distribution[j];
-            }
-        }
-
-        // get Q
-        std::vector<float> expand_distribution(threshold, 0);
-        for (int i = 0; i < target_bin; i++)
-        {
-            const float start = static_cast<float>(i) * num_per_bin;
-            const float end = start + num_per_bin;
-
-            float count = 0;
-
-            const int left_upper = static_cast<int>(ceil(start));
-            float left_scale = 0;
-            if (static_cast<float>(left_upper) > start)
-            {
-                left_scale = static_cast<float>(left_upper) - start;
-                if (distribution[left_upper - 1] != 0)
-                {
-                    count += left_scale;
-                }
-            }
-
-            const int right_lower = static_cast<int>(floor(end));
-            float right_scale = 0;
-            if (static_cast<float>(right_lower) < end)
-            {
-                right_scale = end - static_cast<float>(right_lower);
-                if (distribution[right_lower] != 0)
-                {
-                    count += right_scale;
-                }
-            }
-
-            for (int j = left_upper; j < right_lower; j++)
-            {
-                if (distribution[j] != 0)
-                {
-                    count++;
-                }
-            }
-
-            const float expand_value = quantize_distribution[i] / count;
-
-            if (static_cast<float>(left_upper) > start)
-            {
-                if (distribution[left_upper - 1] != 0)
-                {
-                    expand_distribution[left_upper - 1] += expand_value * left_scale;
-                }
-            }
-            if (static_cast<float>(right_lower) < end)
-            {
-                if (distribution[right_lower] != 0)
-                {
-                    expand_distribution[right_lower] += expand_value * right_scale;
-                }
-            }
-            for (int j = left_upper; j < right_lower; j++)
-            {
-                if (distribution[j] != 0)
-                {
-                    expand_distribution[j] += expand_value;
-                }
-            }
-        }
-
-        const float kl_divergence = compute_kl_divergence(t_distribution, expand_distribution);
-
-        // the best num of bin
-        if (kl_divergence < min_kl_divergence)
-        {
-            min_kl_divergence = kl_divergence;
-            target_threshold = threshold;
-        }
-    }
-
-    return target_threshold;
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: hhchen@openailab.com
+ */
+
+#include <dirent.h>
+#include <string.h>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#ifdef _MSC_VER
+#include "getopt.h"
+#else
+#include <unistd.h>
+#endif
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else // _WIN32
+#include <sys/time.h>
+#endif // _WIN32
+
+#include "quant_utils.hpp"
+
+#ifdef _WIN32
+double get_current_time()
+{
+    LARGE_INTEGER freq;
+    LARGE_INTEGER pc;
+    QueryPerformanceFrequency(&freq);
+    QueryPerformanceCounter(&pc);
+
+    return pc.QuadPart * 1000.0 / freq.QuadPart;
+}
+#else  // _WIN32
+
+double get_current_time()
+{
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+
+    return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
+}
+#endif // _WIN32
+
+void split(float* array, char* str, const char* del)
+{
+    char* s = nullptr;
+    s = strtok(str, del);
+    while (s != nullptr)
+    {
+        *array++ = atof(s);
+        s = strtok(nullptr, del);
+    }
+}
+
+void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean,
+                       const float* scale, int sw_RGB = 0, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0)
+{
+    /* only for yolov5s */
+    if (focus == 1 && letterbox_rows > 0 && letterbox_cols > 0)
+    {
+        cv::Mat sample = cv::imread(image_file, 1);
+        cv::Mat img;
+
+        if (sample.channels() == 4)
+        {
+            cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR);
+        }
+        else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0)
+        {
+            cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR);
+        }
+        else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1)
+        {
+            cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB);
+        }
+        else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1)
+        {
+            cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
+        }
+        else if (sample.channels() == 3 && img_c == 1)
+        {
+            cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY);
+        }
+        else
+        {
+            img = sample;
+        }
+
+        /* letterbox process to support different letterbox size */
+        float scale_letterbox;
+        int resize_rows;
+        int resize_cols;
+        if ((letterbox_rows * 1.0 / img.rows) < (letterbox_cols * 1.0 / img.cols))
+        {
+            scale_letterbox = letterbox_rows * 1.0 / img.rows;
+        }
+        else
+        {
+            scale_letterbox = letterbox_cols * 1.0 / img.cols;
+        }
+        resize_cols = int(scale_letterbox * img.cols);
+        resize_rows = int(scale_letterbox * img.rows);
+
+        cv::resize(img, img, cv::Size(resize_cols, resize_rows));
+        img.convertTo(img, CV_32FC3);
+
+        // Generate a gray image for letterbox using opencv
+        cv::Mat resize_img(letterbox_cols, letterbox_rows, CV_32FC3, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2]));
+        int top = (letterbox_rows - resize_rows) / 2;
+        int bot = (letterbox_rows - resize_rows + 1) / 2;
+        int left = (letterbox_cols - resize_cols) / 2;
+        int right = (letterbox_cols - resize_cols + 1) / 2;
+
+        // Letterbox filling
+        cv::copyMakeBorder(img, resize_img, top, bot, left, right, cv::BORDER_CONSTANT, cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2]));
+
+        resize_img.convertTo(resize_img, CV_32FC3);
+        float* img_data = (float*)resize_img.data;
+        float* input_temp = (float*)malloc(3 * letterbox_rows * letterbox_cols * sizeof(float));
+
+        /* nhwc to nchw */
+        for (int h = 0; h < letterbox_rows; h++)
+        {
+            for (int w = 0; w < letterbox_cols; w++)
+            {
+                for (int c = 0; c < 3; c++)
+                {
+                    int in_index = h * letterbox_cols * 3 + w * 3 + c;
+                    int out_index = c * letterbox_rows * letterbox_cols + h * letterbox_cols + w;
+                    input_temp[out_index] = (img_data[in_index] - mean[c]) * scale[c];
+                }
+            }
+        }
+
+        /* focus process */
+        for (int i = 0; i < 2; i++) // corresponding to rows
+        {
+            for (int g = 0; g < 2; g++) // corresponding to cols
+            {
+                for (int c = 0; c < 3; c++)
+                {
+                    for (int h = 0; h < letterbox_rows / 2; h++)
+                    {
+                        for (int w = 0; w < letterbox_cols / 2; w++)
+                        {
+                            int in_index = i + g * letterbox_cols + c * letterbox_cols * letterbox_rows + h * 2 * letterbox_cols + w * 2;
+                            int out_index = i * 2 * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + g * 3 * (letterbox_cols / 2) * (letterbox_rows / 2) + c * (letterbox_cols / 2) * (letterbox_rows / 2) + h * (letterbox_cols / 2) + w;
+
+                            input_data[out_index] = input_temp[in_index];
+                        }
+                    }
+                }
+            }
+        }
+
+        free(input_temp);
+
+        return;
+    }
+
+    cv::Mat sample = cv::imread(image_file, 1);
+    cv::Mat img;
+
+    if (sample.channels() == 4)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR);
+    }
+    else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 0)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR);
+    }
+    else if (sample.channels() == 1 && img_c == 3 && sw_RGB == 1)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_GRAY2RGB);
+    }
+    else if (sample.channels() == 3 && sw_RGB == 1 && img_c != 1)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
+    }
+    else if (sample.channels() == 3 && img_c == 1)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_BGR2GRAY);
+    }
+    else
+    {
+        img = sample;
+    }
+
+    if (center_crop == 1)
+    {
+        int h0 = 0;
+        int w0 = 0;
+        if (img.rows < img.cols)
+        {
+            h0 = 256;
+            w0 = int(img.cols * (256.0 / img.rows));
+        }
+        else
+        {
+            h0 = int(img.rows * (256.0 / img.cols));
+            w0 = 256;
+        }
+        int center_h = int(h0 / 2);
+        int center_w = int(w0 / 2);
+
+        float* img_data = nullptr;
+
+        cv::resize(img, img, cv::Size(w0, h0));
+        cv::Rect img_roi_box(center_w - 112, center_h - 112, 224, 224);
+        cv::Mat img_crop = img(img_roi_box).clone();
+
+        if (img_c == 3)
+            img_crop.convertTo(img_crop, CV_32FC3);
+        else if (img_c == 1)
+            img_crop.convertTo(img_crop, CV_32FC1);
+        img_data = (float*)img_crop.data;
+
+        int hw = img_h * img_w;
+        for (int h = 0; h < img_h; h++)
+        {
+            for (int w = 0; w < img_w; w++)
+            {
+                for (int c = 0; c < img_c; c++)
+                {
+                    input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c];
+                    img_data++;
+                }
+            }
+        }
+    }
+    else if (letterbox_rows > 0)
+    {
+        float letterbox_size = (float)letterbox_rows;
+        int resize_h = 0;
+        int resize_w = 0;
+        if (img.rows > img.cols)
+        {
+            resize_h = letterbox_size;
+            resize_w = int(img.cols * (letterbox_size / img.rows));
+        }
+        else
+        {
+            resize_h = int(img.rows * (letterbox_size / img.cols));
+            resize_w = letterbox_size;
+        }
+
+        float* img_data = nullptr;
+
+        cv::resize(img, img, cv::Size(resize_w, resize_h));
+        img.convertTo(img, CV_32FC3);
+        cv::Mat img_new(letterbox_size, letterbox_size, CV_32FC3,
+                        cv::Scalar(0.5 / scale[0] + mean[0], 0.5 / scale[1] + mean[1], 0.5 / scale[2] + mean[2]));
+        int dh = int((letterbox_size - resize_h) / 2);
+        int dw = int((letterbox_size - resize_w) / 2);
+
+        for (int h = 0; h < resize_h; h++)
+        {
+            for (int w = 0; w < resize_w; w++)
+            {
+                for (int c = 0; c < 3; ++c)
+                {
+                    int in_index = h * resize_w * 3 + w * 3 + c;
+                    int out_index = (dh + h) * letterbox_size * 3 + (dw + w) * 3 + c;
+
+                    ((float*)img_new.data)[out_index] = ((float*)img.data)[in_index];
+                }
+            }
+        }
+
+        if (img_c == 3)
+            img_new.convertTo(img_new, CV_32FC3);
+        else if (img_c == 1)
+            img_new.convertTo(img_new, CV_32FC1);
+        img_data = (float*)img_new.data;
+
+        int hw = img_h * img_w;
+        for (int h = 0; h < img_h; h++)
+        {
+            for (int w = 0; w < img_w; w++)
+            {
+                for (int c = 0; c < img_c; c++)
+                {
+                    input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c];
+                    img_data++;
+                }
+            }
+        }
+    }
+    else
+    {
+        cv::resize(img, img, cv::Size(img_w, img_h));
+        if (img_c == 3)
+            img.convertTo(img, CV_32FC3);
+        else if (img_c == 1)
+            img.convertTo(img, CV_32FC1);
+        float* img_data = (float*)img.data;
+        int hw = img_h * img_w;
+        for (int h = 0; h < img_h; h++)
+        {
+            for (int w = 0; w < img_w; w++)
+            {
+                for (int c = 0; c < img_c; c++)
+                {
+                    input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c];
+                    img_data++;
+                }
+            }
+        }
+    }
+}
+
+void readFileList(std::string basePath, std::vector<std::string>& imgs)
+{
+    DIR* dir;
+    struct dirent* ptr;
+    std::string base;
+
+    if ((dir = opendir(basePath.c_str())) == NULL)
+    {
+        perror("Open dir error...");
+        exit(1);
+    }
+
+    while ((ptr = readdir(dir)) != NULL)
+    {
+        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) ///current dir OR parrent dir
+            continue;
+        else if (ptr->d_type == 8) ///file
+        {
+            base = basePath + "/" + ptr->d_name;
+            imgs.push_back(base);
+        }
+        else if (ptr->d_type == 4) ///dir
+        {
+            readFileList(basePath + "/" + ptr->d_name, imgs);
+        }
+    }
+    closedir(dir);
+}
+
+std::vector<uint32_t> histCount(float* data, uint32_t elem_num, float max_val, float min_val)
+{
+    float bin_scale = (max_val - min_val) / 2047.f;
+    int bin_zp = int(-min_val / bin_scale);
+    std::vector<uint32_t> hist(2048);
+    for (int i = 0; i < elem_num; i++)
+        if (data[i] != 0)
+            hist[uint32_t(data[i] / bin_scale + bin_zp)]++;
+    return hist;
+}
+
+float compute_kl_divergence(std::vector<float>& dist_a, std::vector<float>& dist_b)
+{
+    const size_t length = dist_a.size();
+    float result = 0;
+
+    for (size_t i = 0; i < length; i++)
+    {
+        if (dist_a[i] != 0)
+        {
+            if (dist_b[i] == 0)
+            {
+                result += 1;
+            }
+            else
+            {
+                result += dist_a[i] * log(dist_a[i] / dist_b[i]);
+            }
+        }
+    }
+
+    return result;
+}
+
+std::vector<float> normalize_histogram(std::vector<uint32_t>& histogram)
+{
+    std::vector<float> histogram_out(histogram.size());
+    const size_t length = histogram.size();
+    float sum = 0;
+
+    for (size_t i = 1; i < length; i++)
+        sum += histogram[i];
+
+    for (size_t i = 1; i < length; i++)
+        histogram_out[i] = float(histogram[i] / sum);
+
+    return histogram_out;
+}
+
+int threshold_distribution(std::vector<uint32_t>& distribution_in, const int target_bin)
+{
+    int target_threshold = target_bin;
+    float min_kl_divergence = FLT_MAX;
+    const int length = static_cast<int>(distribution_in.size());
+
+    std::vector<float> distribution(distribution_in.size());
+    std::vector<float> quantize_distribution(target_bin);
+    distribution = normalize_histogram(distribution_in);
+
+    float threshold_sum = 0;
+    for (int threshold = target_bin; threshold < length; threshold++)
+    {
+        threshold_sum += distribution[threshold];
+    }
+
+    for (int threshold = target_bin; threshold < length; threshold++)
+    {
+        std::vector<float> t_distribution(distribution.begin(), distribution.begin() + threshold);
+
+        t_distribution[threshold - 1] += threshold_sum;
+        threshold_sum -= distribution[threshold];
+
+        // get P
+        fill(quantize_distribution.begin(), quantize_distribution.end(), 0.0f);
+
+        const float num_per_bin = static_cast<float>(threshold) / static_cast<float>(target_bin);
+
+        for (int i = 0; i < target_bin; i++)
+        {
+            const float start = static_cast<float>(i) * num_per_bin;
+            const float end = start + num_per_bin;
+
+            const int left_upper = static_cast<int>(ceil(start));
+            if (static_cast<float>(left_upper) > start)
+            {
+                const float left_scale = static_cast<float>(left_upper) - start;
+                quantize_distribution[i] += left_scale * distribution[left_upper - 1];
+            }
+
+            const int right_lower = static_cast<int>(floor(end));
+
+            if (static_cast<float>(right_lower) < end)
+            {
+                const float right_scale = end - static_cast<float>(right_lower);
+                quantize_distribution[i] += right_scale * distribution[right_lower];
+            }
+
+            for (int j = left_upper; j < right_lower; j++)
+            {
+                quantize_distribution[i] += distribution[j];
+            }
+        }
+
+        // get Q
+        std::vector<float> expand_distribution(threshold, 0);
+        for (int i = 0; i < target_bin; i++)
+        {
+            const float start = static_cast<float>(i) * num_per_bin;
+            const float end = start + num_per_bin;
+
+            float count = 0;
+
+            const int left_upper = static_cast<int>(ceil(start));
+            float left_scale = 0;
+            if (static_cast<float>(left_upper) > start)
+            {
+                left_scale = static_cast<float>(left_upper) - start;
+                if (distribution[left_upper - 1] != 0)
+                {
+                    count += left_scale;
+                }
+            }
+
+            const int right_lower = static_cast<int>(floor(end));
+            float right_scale = 0;
+            if (static_cast<float>(right_lower) < end)
+            {
+                right_scale = end - static_cast<float>(right_lower);
+                if (distribution[right_lower] != 0)
+                {
+                    count += right_scale;
+                }
+            }
+
+            for (int j = left_upper; j < right_lower; j++)
+            {
+                if (distribution[j] != 0)
+                {
+                    count++;
+                }
+            }
+
+            const float expand_value = quantize_distribution[i] / count;
+
+            if (static_cast<float>(left_upper) > start)
+            {
+                if (distribution[left_upper - 1] != 0)
+                {
+                    expand_distribution[left_upper - 1] += expand_value * left_scale;
+                }
+            }
+            if (static_cast<float>(right_lower) < end)
+            {
+                if (distribution[right_lower] != 0)
+                {
+                    expand_distribution[right_lower] += expand_value * right_scale;
+                }
+            }
+            for (int j = left_upper; j < right_lower; j++)
+            {
+                if (distribution[j] != 0)
+                {
+                    expand_distribution[j] += expand_value;
+                }
+            }
+        }
+
+        const float kl_divergence = compute_kl_divergence(t_distribution, expand_distribution);
+
+        // the best num of bin
+        if (kl_divergence < min_kl_divergence)
+        {
+            min_kl_divergence = kl_divergence;
+            target_threshold = threshold;
+        }
+    }
+
+    return target_threshold;
+}
diff --git a/tools/quantize/quant_utils.hpp b/tools/quantize/quant_utils.hpp
index 6440a8708..4ad636763 100644
--- a/tools/quantize/quant_utils.hpp
+++ b/tools/quantize/quant_utils.hpp
@@ -1,49 +1,48 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: hhchen@openailab.com
- */
-#pragma once
-
-#include <cstdlib>
-#include <cstdio>
-#include <sys/stat.h>
-#include <vector>
-#include <string>
-#include <cmath>
-
-
-double get_current_time();
-
-void split(float* array, char* str, const char* del);
-
-void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean,
-                       const float* scale, int sw_RGB, int center_crop, int letterbox_rows, int letterbox_cols, int focus);
-
-void readFileList(std::string basePath, std::vector<std::string>& imgs);
-
-std::vector<uint32_t> histCount(float *data, uint32_t elem_num, float max_val, float min_val);
-
-float compute_kl_divergence(std::vector<float> &dist_a, std::vector<float> &dist_b);
-
-std::vector<float> normalize_histogram(std::vector<uint32_t> &histogram);
-
-int threshold_distribution(std::vector<uint32_t> &distribution_in, const int target_bin);
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: hhchen@openailab.com
+ */
+#pragma once
+
+#include <cstdlib>
+#include <cstdio>
+#include <sys/stat.h>
+#include <vector>
+#include <string>
+#include <cmath>
+
+double get_current_time();
+
+void split(float* array, char* str, const char* del);
+
+void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean,
+                       const float* scale, int sw_RGB, int center_crop, int letterbox_rows, int letterbox_cols, int focus);
+
+void readFileList(std::string basePath, std::vector<std::string>& imgs);
+
+std::vector<uint32_t> histCount(float* data, uint32_t elem_num, float max_val, float min_val);
+
+float compute_kl_divergence(std::vector<float>& dist_a, std::vector<float>& dist_b);
+
+std::vector<float> normalize_histogram(std::vector<uint32_t>& histogram);
+
+int threshold_distribution(std::vector<uint32_t>& distribution_in, const int target_bin);
diff --git a/tools/quantize/savegraph/save_graph.cpp b/tools/quantize/savegraph/save_graph.cpp
index 92b24b443..b528ca2fc 100644
--- a/tools/quantize/savegraph/save_graph.cpp
+++ b/tools/quantize/savegraph/save_graph.cpp
@@ -35,7 +35,7 @@ bool IsSaveString(void)
 {
     const char* env = std::getenv("TM_NO_STRING");
 
-    if(env)
+    if (env)
         return false;
     else
         return true;
@@ -45,7 +45,7 @@ bool IsSaveData(void)
 {
     const char* env = std::getenv("TM_FOR_BENCHMARK");
 
-    if(env)
+    if (env)
         return false;
     else
         return true;
@@ -53,7 +53,7 @@ bool IsSaveData(void)
 
 bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func)
 {
-    if(op_save_map_.count(op_type))
+    if (op_save_map_.count(op_type))
         return false;
 
     op_save_map_[op_type] = save_func;
@@ -61,7 +61,7 @@ bool RegisterOpSaveMethod(const uint16_t& op_type, const op_save_t& save_func)
 }
 
 tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct tensor* tensor,
-                                         unsigned int tensor_id, unsigned int buffer_id)
+                          unsigned int tensor_id, unsigned int buffer_id)
 {
     TM2_Tensor tm_tensor;
     tm_tensor.tensor_id = tensor_id;
@@ -72,11 +72,11 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t
 
     bool tm_with_string = IsSaveString();
 
-    if(tm_with_string)
+    if (tm_with_string)
     {
         std::string name = tensor->name;
         TM2_String tensor_name;
-        tensor_name.size = name.size() + 1;    // including trailing \0
+        tensor_name.size = name.size() + 1; // including trailing \0
         tensor_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), tensor_name.size);
         tm_tensor.offset_s_tname = WriteTmObject(start_ptr, cur_pos, &tensor_name, sizeof(TM2_String));
     }
@@ -86,13 +86,13 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t
     /* Get the dims of the tensor */
     int* dim = tensor->dims;
     size_t vector_size;
-    if(tensor->dim_num)
+    if (tensor->dim_num)
     {
         /* Write the vector of dims */
         vector_size = sizeof(tm_size_t) + sizeof(int32_t) * tensor->dim_num;
-        TM2_Vector_dims* v_dims = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_dims = (TM2_Vector_dims*)malloc(vector_size);
         v_dims->v_num = tensor->dim_num;
-        for(unsigned int i = 0; i < tensor->dim_num; i++)
+        for (unsigned int i = 0; i < tensor->dim_num; i++)
         {
             v_dims->dims[i] = dim[i];
         }
@@ -103,10 +103,10 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t
         tm_tensor.offset_vd_dims = TM2_NOT_SET;
 
     /* Write the quant params */
-    if(tensor->quant_param_num != 0)
+    if (tensor->quant_param_num != 0)
     {
         vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor->quant_param_num;
-        TM2_Vector_offsets* v_qtparams = ( TM2_Vector_offsets* )malloc(vector_size);
+        TM2_Vector_offsets* v_qtparams = (TM2_Vector_offsets*)malloc(vector_size);
         v_qtparams->v_num = tensor->quant_param_num;
         if (v_qtparams->v_num == 1)
         {
@@ -117,7 +117,7 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t
         }
         else if (v_qtparams->v_num > 1)
         {
-            for(unsigned int i = 0; i < v_qtparams->v_num; i++)
+            for (unsigned int i = 0; i < v_qtparams->v_num; i++)
             {
                 TM2_QuantParam qtparam;
                 qtparam.zero_point = tensor->zp_list[i];
@@ -126,7 +126,6 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t
                 v_qtparams->offsets[i] = WriteTmObject(start_ptr, cur_pos, &qtparam, sizeof(TM2_QuantParam));
             }
         }
-        
 
         /* Write the vector of quant params */
         tm_tensor.offect_vo_quantparams = WriteTmObject(start_ptr, cur_pos, v_qtparams, vector_size);
@@ -139,20 +138,20 @@ tm_uoffset_t SaveTmTensor(void* const start_ptr, tm_uoffset_t* cur_pos, struct t
 }
 
 tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct graph* graph, struct node* node,
-                                       name_map_t& tensor_name_map)
+                        name_map_t& tensor_name_map)
 {
     TM2_Node tm_node;
-    memset(&tm_node, 0 , sizeof(TM2_Node));
+    memset(&tm_node, 0, sizeof(TM2_Node));
     tm_node.node_id = node->index;
     tm_node.dynamic_shape = node->dynamic_shape;
 
     bool tm_with_string = IsSaveString();
 
-    if(tm_with_string)
+    if (tm_with_string)
     {
         std::string name = node->name;
         TM2_String node_name;
-        node_name.size = name.size() + 1;    // including trailing \0
+        node_name.size = name.size() + 1; // including trailing \0
         node_name.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, name.c_str(), node_name.size);
         tm_node.offset_s_nname = WriteTmObject(start_ptr, cur_pos, &node_name, sizeof(TM2_String));
     }
@@ -162,13 +161,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct gra
     unsigned int input_num = node->input_num;
     unsigned int output_num = node->output_num;
 
-    if(input_num)
+    if (input_num)
     {
         /* Write the vector of input indices */
         size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * input_num;
-        TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size);
+        TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size);
         v_input_indices->v_num = input_num;
-        for(unsigned int i = 0; i < input_num; i++)
+        for (unsigned int i = 0; i < input_num; i++)
         {
             struct tensor* p_tensor = get_ir_graph_tensor(graph, node->input_tensors[i]);
             v_input_indices->indices[i] = tensor_name_map[p_tensor->name];
@@ -179,13 +178,13 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct gra
     else
         tm_node.offset_vi_input_tensors = TM2_NOT_SET;
 
-    if(output_num)
+    if (output_num)
     {
         /* Write the vector of output indices */
         size_t vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * output_num;
-        TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size);
+        TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size);
         v_output_indices->v_num = output_num;
-        for(unsigned int i = 0; i < output_num; i++)
+        for (unsigned int i = 0; i < output_num; i++)
         {
             struct tensor* p_tensor = get_ir_graph_tensor(graph, node->output_tensors[i]);
             v_output_indices->indices[i] = tensor_name_map[p_tensor->name];
@@ -198,7 +197,7 @@ tm_uoffset_t SaveTmNode(void* const start_ptr, tm_uoffset_t* cur_pos, struct gra
 
     /* Write tm operator */
     uint16_t op_type = node->op.type;
-    if(!op_save_map_.count(op_type))
+    if (!op_save_map_.count(op_type))
     {
         TLOG_ERR("cannot find save function for operator:%d \n", op_type);
         return false;
@@ -230,12 +229,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct
 
     /* Write the nodes */
     size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * graph->node_num;
-    TM2_Vector_offsets* v_nodes = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_nodes = (TM2_Vector_offsets*)malloc(vector_size);
     v_nodes->v_num = graph->node_num;
-    for(unsigned int i = 0; i < graph->node_num; i++)
+    for (unsigned int i = 0; i < graph->node_num; i++)
     {
         struct node* p_node = get_ir_graph_node(graph, i);
-        for(unsigned int k = 0; k < p_node->output_num; k++)
+        for (unsigned int k = 0; k < p_node->output_num; k++)
         {
             struct tensor* p_tensor = get_ir_graph_tensor(graph, p_node->output_tensors[k]);
             tensor_ptrs.push_back(p_tensor);
@@ -249,12 +248,12 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct
 
     /* Write the tensors */
     vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * tensor_num;
-    TM2_Vector_offsets* v_tensors = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_tensors = (TM2_Vector_offsets*)malloc(vector_size);
     v_tensors->v_num = tensor_num;
-    for(unsigned int i = 0; i < tensor_num; i++)
+    for (unsigned int i = 0; i < tensor_num; i++)
     {
         struct tensor* p_tensor = tensor_ptrs[i];
-        if(p_tensor->tensor_type == TENSOR_TYPE_CONST)
+        if (p_tensor->tensor_type == TENSOR_TYPE_CONST)
         {
             // buf_ptrs.push_back(p_tensor->GetMemAddr());
             buf_ptrs.push_back(p_tensor->data); // may cause bug
@@ -269,14 +268,14 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct
 
     /* Write the buffers */
     vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * buffer_num;
-    TM2_Vector_offsets* v_buffers = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_buffers = (TM2_Vector_offsets*)malloc(vector_size);
     v_buffers->v_num = buffer_num;
-    for(unsigned int i = 0; i < buffer_num; i++)
+    for (unsigned int i = 0; i < buffer_num; i++)
     {
         TM2_Buffer tm_buf;
         tm_buf.size = buf_sizes[i];
 
-        if(tm_no_data)
+        if (tm_no_data)
         {
             /* TM2_FOR_BENCHMARK environment variable exists. Not write buf data into the tm file */
             tm_buf.offset_data = TM2_NOT_SET;
@@ -284,8 +283,7 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct
         else
         {
             /* TM2_FOR_BENCHMARK environment variable does not exist */
-            tm_buf.offset_data =
-                WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast<const uint8_t*>(buf_ptrs[i]), tm_buf.size);
+            tm_buf.offset_data = WriteTmFileAlign1(start_ptr, cur_pos, reinterpret_cast<const uint8_t*>(buf_ptrs[i]), tm_buf.size);
         }
         v_buffers->offsets[i] = WriteTmObject(start_ptr, cur_pos, &tm_buf, sizeof(TM2_Buffer));
     }
@@ -294,9 +292,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct
 
     /* Write the vector of input indices */
     vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->input_num;
-    TM2_Vector_indices* v_input_indices = ( TM2_Vector_indices* )malloc(vector_size);
+    TM2_Vector_indices* v_input_indices = (TM2_Vector_indices*)malloc(vector_size);
     v_input_indices->v_num = graph->input_num;
-    for(unsigned int i = 0; i < graph->input_num; i++)
+    for (unsigned int i = 0; i < graph->input_num; i++)
     {
         v_input_indices->indices[i] = graph->input_nodes[i];
     }
@@ -304,9 +302,9 @@ tm_uoffset_t SaveTmSubgraph(void* const start_ptr, tm_uoffset_t* cur_pos, struct
 
     /* Write the vector of output indices */
     vector_size = sizeof(tm_size_t) + sizeof(uint32_t) * graph->output_num;
-    TM2_Vector_indices* v_output_indices = ( TM2_Vector_indices* )malloc(vector_size);
+    TM2_Vector_indices* v_output_indices = (TM2_Vector_indices*)malloc(vector_size);
     v_output_indices->v_num = graph->output_num;
-    for(unsigned int i = 0; i < graph->output_num; i++)
+    for (unsigned int i = 0; i < graph->output_num; i++)
     {
         v_output_indices->indices[i] = graph->output_nodes[i];
     }
@@ -356,7 +354,7 @@ bool SaveModelIntoMem(void* start_ptr, struct graph* graph, uint32_t* tm_model_s
     /* Write the subgraphs */
     /* Only 1 subgraph is supported currently */
     size_t vector_size = sizeof(tm_size_t) + sizeof(tm_uoffset_t) * 1;
-    TM2_Vector_offsets* v_subgraphs = ( TM2_Vector_offsets* )malloc(vector_size);
+    TM2_Vector_offsets* v_subgraphs = (TM2_Vector_offsets*)malloc(vector_size);
     v_subgraphs->v_num = 1;
     v_subgraphs->offsets[0] = SaveTmSubgraph(start_ptr, &cur_pos, graph);
 
@@ -382,17 +380,16 @@ int save_model(std::vector<void*>& addr_list, std::vector<int>& size_list, struc
 
     uint32_t malloc_size = TM_FILE_MAX_SIZE;
     const char* env = std::getenv("TM_FILE_MAX_SIZE");
-    if(env)
+    if (env)
         malloc_size = std::atoi(env);
 
-    void* start_ptr = ( void* )malloc(malloc_size);
-    if(start_ptr == nullptr)
+    void* start_ptr = (void*)malloc(malloc_size);
+    if (start_ptr == nullptr)
     {
-        TLOG_ERR("Malloc memory failed: .\n",malloc_size);
+        TLOG_ERR("Malloc memory failed: .\n", malloc_size);
         return false;
     }
 
-
     bool ret = SaveModelIntoMem(start_ptr, graph, &tm_model_size);
 
     addr_list.push_back(start_ptr);
@@ -411,16 +408,16 @@ bool save_graph(graph_t graph, const char* fname)
     struct graph* ir_graph = (struct graph*)graph;
     /* Open the tengine model file */
     int fd = open(fname, O_RDWR | O_CREAT | O_TRUNC, 0666);
-    if(fd == -1)
+    if (fd == -1)
     {
-        TLOG_ERR("Could not open %s\n",fname);
+        TLOG_ERR("Could not open %s\n", fname);
         return false;
     }
 
     std::vector<void*> addr_list;
     std::vector<int> size_list;
 
-    if(!save_model(addr_list, size_list, ir_graph))
+    if (!save_model(addr_list, size_list, ir_graph))
     {
         close(fd);
         return false;
@@ -433,7 +430,7 @@ bool save_graph(graph_t graph, const char* fname)
     close(fd);
     free(buf);
 
-    if(ret != size)
+    if (ret != size)
         return false;
     else
         return true;
diff --git a/tools/quantize/savegraph/save_graph.hpp b/tools/quantize/savegraph/save_graph.hpp
index 5ed1757f4..fe4fb2d0f 100644
--- a/tools/quantize/savegraph/save_graph.hpp
+++ b/tools/quantize/savegraph/save_graph.hpp
@@ -9,8 +9,7 @@
 #include <fcntl.h>
 #include <functional>
 
-extern "C" 
-{
+extern "C" {
 #include "tengine/c_api.h"
 #include "graph/graph.h"
 #include "graph/subgraph.h"
@@ -21,8 +20,6 @@ extern "C"
 #include "tm2_format.h"
 }
 
-
 #include "tm2_op_save.hpp"
 
-
 bool save_graph(graph_t graph, const char* fname);
diff --git a/tools/quantize/savegraph/tm2_format.h b/tools/quantize/savegraph/tm2_format.h
index 5fb2aea3b..fc4fa32a4 100644
--- a/tools/quantize/savegraph/tm2_format.h
+++ b/tools/quantize/savegraph/tm2_format.h
@@ -32,8 +32,8 @@
 extern "C" {
 #endif
 
-#define TM2_FILE_VER_MAIN 2
-#define TM2_FILE_VER_SUB 0
+#define TM2_FILE_VER_MAIN    2
+#define TM2_FILE_VER_SUB     0
 #define TM2_FILE_VER_COMPILE 0
 
 #define TM2_OP_VER 1
@@ -42,247 +42,247 @@ extern "C" {
 
 /* Type define */
 typedef uint32_t tm_uoffset_t; /* offset is 4-byte unsigned integer */
-typedef uint32_t tm_size_t; /* size is 4-byte unsigned integer */
-typedef uint8_t tm_bool_t; /* bool is 1-byte unsigned integer */
+typedef uint32_t tm_size_t;    /* size is 4-byte unsigned integer */
+typedef uint8_t tm_bool_t;     /* bool is 1-byte unsigned integer */
 
 /* Operator strings */
-#define TM2_OPSTR_ACCURACY "Accuracy"
-#define TM2_OPSTR_BATCHNORMALIZATION "BatchNormalization"
-#define TM2_OPSTR_BILINEARRESIZE "Resize"
-#define TM2_OPSTR_CONCAT "Concat"
-#define TM2_OPSTR_CONST "Const"
-#define TM2_OPSTR_CONVOLUTION "Convolution"
-#define TM2_OPSTR_DECONVOLUTION "Deconvolution"
-#define TM2_OPSTR_DETECTIONOUTPUT "DetectionOutput"
-#define TM2_OPSTR_DROPOUT "Dropout"
-#define TM2_OPSTR_ELTWISE "Eltwise"
-#define TM2_OPSTR_FLATTEN "Flatten"
-#define TM2_OPSTR_FULLYCONNECTED "FullyConnected"
-#define TM2_OPSTR_INPUTOP "InputOp"
-#define TM2_OPSTR_LRN "LRN"
-#define TM2_OPSTR_NORMALIZE "Normalize"
-#define TM2_OPSTR_PERMUTE "Permute"
-#define TM2_OPSTR_POOLING "Pooling"
-#define TM2_OPSTR_PRELU "PReLU"
-#define TM2_OPSTR_PRIORBOX "PriorBox"
-#define TM2_OPSTR_REGION "Region"
-#define TM2_OPSTR_RELU "ReLu"
-#define TM2_OPSTR_RELU6 "ReLu6"
-#define TM2_OPSTR_REORG "Reorg"
-#define TM2_OPSTR_RESHAPE "Reshape"
-#define TM2_OPSTR_ROIPOOLING "ROIPooling"
-#define TM2_OPSTR_RPN "RPN"
-#define TM2_OPSTR_SCALE "Scale"
-#define TM2_OPSTR_SLICE "Slice"
-#define TM2_OPSTR_SOFTMAX "Softmax"
-#define TM2_OPSTR_SPLIT "Split"
+#define TM2_OPSTR_ACCURACY             "Accuracy"
+#define TM2_OPSTR_BATCHNORMALIZATION   "BatchNormalization"
+#define TM2_OPSTR_BILINEARRESIZE       "Resize"
+#define TM2_OPSTR_CONCAT               "Concat"
+#define TM2_OPSTR_CONST                "Const"
+#define TM2_OPSTR_CONVOLUTION          "Convolution"
+#define TM2_OPSTR_DECONVOLUTION        "Deconvolution"
+#define TM2_OPSTR_DETECTIONOUTPUT      "DetectionOutput"
+#define TM2_OPSTR_DROPOUT              "Dropout"
+#define TM2_OPSTR_ELTWISE              "Eltwise"
+#define TM2_OPSTR_FLATTEN              "Flatten"
+#define TM2_OPSTR_FULLYCONNECTED       "FullyConnected"
+#define TM2_OPSTR_INPUTOP              "InputOp"
+#define TM2_OPSTR_LRN                  "LRN"
+#define TM2_OPSTR_NORMALIZE            "Normalize"
+#define TM2_OPSTR_PERMUTE              "Permute"
+#define TM2_OPSTR_POOLING              "Pooling"
+#define TM2_OPSTR_PRELU                "PReLU"
+#define TM2_OPSTR_PRIORBOX             "PriorBox"
+#define TM2_OPSTR_REGION               "Region"
+#define TM2_OPSTR_RELU                 "ReLu"
+#define TM2_OPSTR_RELU6                "ReLu6"
+#define TM2_OPSTR_REORG                "Reorg"
+#define TM2_OPSTR_RESHAPE              "Reshape"
+#define TM2_OPSTR_ROIPOOLING           "ROIPooling"
+#define TM2_OPSTR_RPN                  "RPN"
+#define TM2_OPSTR_SCALE                "Scale"
+#define TM2_OPSTR_SLICE                "Slice"
+#define TM2_OPSTR_SOFTMAX              "Softmax"
+#define TM2_OPSTR_SPLIT                "Split"
 #define TM2_OPSTR_DETECTIONPOSTPROCESS "DetectionPostProcess"
-#define TM2_OPSTR_GEMM "Gemm"
-#define TM2_OPSTR_GENERIC "Generic"
-#define TM2_OPSTR_LOGISTIC "Logistic"
-#define TM2_OPSTR_LSTM "LSTM"
-#define TM2_OPSTR_RNN "RNN"
-#define TM2_OPSTR_TANH "Tanh"
-#define TM2_OPSTR_SIGMOID "Sigmoid"
-#define TM2_OPSTR_SQUEEZE "Squeeze"
-#define TM2_OPSTR_PAD "Pad"
-#define TM2_OPSTR_STRIDEDSLICE "StridedSlice"
-#define TM2_OPSTR_REDUCTION "Reduction"
-#define TM2_OPSTR_ARGMAX "ArgMax"
-#define TM2_OPSTR_ARGMIN "ArgMin"
-#define TM2_OPSTR_TOPKV2 "TopKV2"
-#define TM2_OPSTR_MAX "Maximum"
-#define TM2_OPSTR_MIN "Minimum"
-#define TM2_OPSTR_ADDN "Addn"
-#define TM2_OPSTR_SWAPAXIS "SwapAxis"
-#define TM2_OPSTR_GRU "GRU"
-#define TM2_OPSTR_FUSEDBNSCALERELU "Fused.BNScaleReLu"
-#define TM2_OPSTR_UPSAMPLE "Upsample"
-#define TM2_OPSTR_SHUFFLECHANNEL "ShuffleChannel"
-#define TM2_OPSTR_RESIZE "Resize"
-#define TM2_OPSTR_SPACETOBATCHND "SpaceToBatchND"
-#define TM2_OPSTR_BATCHTOSPACEND "BatchToSpaceND"
-#define TM2_OPSTR_CROP "Crop"
-#define TM2_OPSTR_PSROIPOOLING "Psroipooling"
-#define TM2_OPSTR_ROIALIGN "Roialign"
-#define TM2_OPSTR_EXPANDDIMS "Expanddims"
-#define TM2_OPSTR_UNARY "Unary"
-#define TM2_OPSTR_BIAS "Bias"
-#define TM2_OPSTR_NOOP "Noop"
-#define TM2_OPSTR_THRESHOLD "Threshold"
-#define TM2_OPSTR_HARDSIGMOID "Hardsigmoid"
-#define TM2_OPSTR_EMBED "Embedding"
-#define TM2_OPSTR_INSTANCENORM "InstanceNorm"
-#define TM2_OPSTR_MVN "MVN"
-#define TM2_OPSTR_ABSVAL "Absval"
-#define TM2_OPSTR_CAST "Cast"
-#define TM2_OPSTR_HARDSWISH "HardSwish"
-#define TM2_OPSTR_INTERP "Interp"
-#define TM2_OPSTR_SELU "Selu"
-#define TM2_OPSTR_ELU "Elu"
-#define TM2_OPSTR_BROADMUL "BroadMul"
-#define TM2_OPSTR_LOGICAL "Logical"
-#define TM2_OPSTR_GATHER "Gather"
-#define TM2_OPSTR_TRANSPOSE "Transpose"
-#define TM2_OPSTR_REVERSE "Reverse"
-#define TM2_OPSTR_COMPARISON "Comparison"
-#define TM2_OPSTR_SPACETODEPTH "SpaceToDepth"
-#define TM2_OPSTR_DEPTHTOSPACE "DepthToSpace"
-#define TM2_OPSTR_SQUAREDDIFFERENCE "SquaredDifference"
-#define TM2_OPSTR_SPARSETODENSE "SparseToDense"
-#define TM2_OPSTR_CEIL "Ceil"
-#define TM2_OPSTR_ROUND "Round"
-#define TM2_OPSTR_ZEROSLIKE "ZerosLike"
-#define TM2_OPSTR_CLIP "Clip"
-#define TM2_OPSTR_UNSQUEEZE "Unsqueeze"
-#define TM2_OPSTR_REDUCEL2 "ReduceL2"
-#define TM2_OPSTR_MEAN "Mean"
-#define TM2_OPSTR_EXPAND "Expand"
-#define TM2_OPSTR_MATMUL "MatMul"
-#define TM2_OPSTR_SCATTER "Scatter"
-#define TM2_OPSTR_SHAPE "Shape"
-#define TM2_OPSTR_WHERE "Where"
-#define TM2_OPSTR_TILE "Tile"
+#define TM2_OPSTR_GEMM                 "Gemm"
+#define TM2_OPSTR_GENERIC              "Generic"
+#define TM2_OPSTR_LOGISTIC             "Logistic"
+#define TM2_OPSTR_LSTM                 "LSTM"
+#define TM2_OPSTR_RNN                  "RNN"
+#define TM2_OPSTR_TANH                 "Tanh"
+#define TM2_OPSTR_SIGMOID              "Sigmoid"
+#define TM2_OPSTR_SQUEEZE              "Squeeze"
+#define TM2_OPSTR_PAD                  "Pad"
+#define TM2_OPSTR_STRIDEDSLICE         "StridedSlice"
+#define TM2_OPSTR_REDUCTION            "Reduction"
+#define TM2_OPSTR_ARGMAX               "ArgMax"
+#define TM2_OPSTR_ARGMIN               "ArgMin"
+#define TM2_OPSTR_TOPKV2               "TopKV2"
+#define TM2_OPSTR_MAX                  "Maximum"
+#define TM2_OPSTR_MIN                  "Minimum"
+#define TM2_OPSTR_ADDN                 "Addn"
+#define TM2_OPSTR_SWAPAXIS             "SwapAxis"
+#define TM2_OPSTR_GRU                  "GRU"
+#define TM2_OPSTR_FUSEDBNSCALERELU     "Fused.BNScaleReLu"
+#define TM2_OPSTR_UPSAMPLE             "Upsample"
+#define TM2_OPSTR_SHUFFLECHANNEL       "ShuffleChannel"
+#define TM2_OPSTR_RESIZE               "Resize"
+#define TM2_OPSTR_SPACETOBATCHND       "SpaceToBatchND"
+#define TM2_OPSTR_BATCHTOSPACEND       "BatchToSpaceND"
+#define TM2_OPSTR_CROP                 "Crop"
+#define TM2_OPSTR_PSROIPOOLING         "Psroipooling"
+#define TM2_OPSTR_ROIALIGN             "Roialign"
+#define TM2_OPSTR_EXPANDDIMS           "Expanddims"
+#define TM2_OPSTR_UNARY                "Unary"
+#define TM2_OPSTR_BIAS                 "Bias"
+#define TM2_OPSTR_NOOP                 "Noop"
+#define TM2_OPSTR_THRESHOLD            "Threshold"
+#define TM2_OPSTR_HARDSIGMOID          "Hardsigmoid"
+#define TM2_OPSTR_EMBED                "Embedding"
+#define TM2_OPSTR_INSTANCENORM         "InstanceNorm"
+#define TM2_OPSTR_MVN                  "MVN"
+#define TM2_OPSTR_ABSVAL               "Absval"
+#define TM2_OPSTR_CAST                 "Cast"
+#define TM2_OPSTR_HARDSWISH            "HardSwish"
+#define TM2_OPSTR_INTERP               "Interp"
+#define TM2_OPSTR_SELU                 "Selu"
+#define TM2_OPSTR_ELU                  "Elu"
+#define TM2_OPSTR_BROADMUL             "BroadMul"
+#define TM2_OPSTR_LOGICAL              "Logical"
+#define TM2_OPSTR_GATHER               "Gather"
+#define TM2_OPSTR_TRANSPOSE            "Transpose"
+#define TM2_OPSTR_REVERSE              "Reverse"
+#define TM2_OPSTR_COMPARISON           "Comparison"
+#define TM2_OPSTR_SPACETODEPTH         "SpaceToDepth"
+#define TM2_OPSTR_DEPTHTOSPACE         "DepthToSpace"
+#define TM2_OPSTR_SQUAREDDIFFERENCE    "SquaredDifference"
+#define TM2_OPSTR_SPARSETODENSE        "SparseToDense"
+#define TM2_OPSTR_CEIL                 "Ceil"
+#define TM2_OPSTR_ROUND                "Round"
+#define TM2_OPSTR_ZEROSLIKE            "ZerosLike"
+#define TM2_OPSTR_CLIP                 "Clip"
+#define TM2_OPSTR_UNSQUEEZE            "Unsqueeze"
+#define TM2_OPSTR_REDUCEL2             "ReduceL2"
+#define TM2_OPSTR_MEAN                 "Mean"
+#define TM2_OPSTR_EXPAND               "Expand"
+#define TM2_OPSTR_MATMUL               "MatMul"
+#define TM2_OPSTR_SCATTER              "Scatter"
+#define TM2_OPSTR_SHAPE                "Shape"
+#define TM2_OPSTR_WHERE                "Where"
+#define TM2_OPSTR_TILE                 "Tile"
 /* Operator types */
-#define TM2_OPTYPE_ACCURACY 0 /* No Param                 */
-#define TM2_OPTYPE_BATCHNORMALIZATION 1 /* TM2_BatchNormParam       */
-#define TM2_OPTYPE_BILINEARRESIZE 2 /* TM2_ResizeParam          */
-#define TM2_OPTYPE_CONCAT 3 /* TM2_ConcatParam          */
-#define TM2_OPTYPE_CONST 4 /* No Param                 */
-#define TM2_OPTYPE_CONVOLUTION 5 /* TM2_ConvParam            */
-#define TM2_OPTYPE_DECONVOLUTION 6 /* TM2_DeconvParam          */
-#define TM2_OPTYPE_DETECTIONOUTPUT 7 /* TM2_DetectionOutputParam */
-#define TM2_OPTYPE_DROPOUT 8 /* No Param                 */
-#define TM2_OPTYPE_ELTWISE 9 /* TM2_EltwiseParam         */
-#define TM2_OPTYPE_FLATTEN 10 /* TM2_FlattenParam         */
-#define TM2_OPTYPE_FULLYCONNECTED 11 /* TM2_FCParam              */
-#define TM2_OPTYPE_INPUTOP 12 /* No Param                 */
-#define TM2_OPTYPE_LRN 13 /* TM2_LRNParam             */
-#define TM2_OPTYPE_NORMALIZE 14 /* TM2_NormalizeParam       */
-#define TM2_OPTYPE_PERMUTE 15 /* TM2_PermuteParam         */
-#define TM2_OPTYPE_POOLING 16 /* TM2_PoolParam            */
-#define TM2_OPTYPE_PRELU 17 /* No Param                 */
-#define TM2_OPTYPE_PRIORBOX 18 /* TM2_PriorBoxParam        */
-#define TM2_OPTYPE_REGION 19 /* TM2_RegionParam          */
-#define TM2_OPTYPE_RELU 20 /* TM2_ReLuParam            */
-#define TM2_OPTYPE_RELU6 21 /* No Param                 */
-#define TM2_OPTYPE_REORG 22 /* TM2_ReorgParam           */
-#define TM2_OPTYPE_RESHAPE 23 /* TM2_ReshapeParam         */
-#define TM2_OPTYPE_ROIPOOLING 24 /* TM2_ROIPoolingParam      */
-#define TM2_OPTYPE_RPN 25 /* TM2_RPNParam             */
-#define TM2_OPTYPE_SCALE 26 /* TM2_ScaleParam           */
-#define TM2_OPTYPE_SLICE 27 /* TM2_SliceParam           */
-#define TM2_OPTYPE_SOFTMAX 28 /* TM2_SoftmaxParam         */
-#define TM2_OPTYPE_SPLIT 29 /* No Param                 */
+#define TM2_OPTYPE_ACCURACY             0  /* No Param                 */
+#define TM2_OPTYPE_BATCHNORMALIZATION   1  /* TM2_BatchNormParam       */
+#define TM2_OPTYPE_BILINEARRESIZE       2  /* TM2_ResizeParam          */
+#define TM2_OPTYPE_CONCAT               3  /* TM2_ConcatParam          */
+#define TM2_OPTYPE_CONST                4  /* No Param                 */
+#define TM2_OPTYPE_CONVOLUTION          5  /* TM2_ConvParam            */
+#define TM2_OPTYPE_DECONVOLUTION        6  /* TM2_DeconvParam          */
+#define TM2_OPTYPE_DETECTIONOUTPUT      7  /* TM2_DetectionOutputParam */
+#define TM2_OPTYPE_DROPOUT              8  /* No Param                 */
+#define TM2_OPTYPE_ELTWISE              9  /* TM2_EltwiseParam         */
+#define TM2_OPTYPE_FLATTEN              10 /* TM2_FlattenParam         */
+#define TM2_OPTYPE_FULLYCONNECTED       11 /* TM2_FCParam              */
+#define TM2_OPTYPE_INPUTOP              12 /* No Param                 */
+#define TM2_OPTYPE_LRN                  13 /* TM2_LRNParam             */
+#define TM2_OPTYPE_NORMALIZE            14 /* TM2_NormalizeParam       */
+#define TM2_OPTYPE_PERMUTE              15 /* TM2_PermuteParam         */
+#define TM2_OPTYPE_POOLING              16 /* TM2_PoolParam            */
+#define TM2_OPTYPE_PRELU                17 /* No Param                 */
+#define TM2_OPTYPE_PRIORBOX             18 /* TM2_PriorBoxParam        */
+#define TM2_OPTYPE_REGION               19 /* TM2_RegionParam          */
+#define TM2_OPTYPE_RELU                 20 /* TM2_ReLuParam            */
+#define TM2_OPTYPE_RELU6                21 /* No Param                 */
+#define TM2_OPTYPE_REORG                22 /* TM2_ReorgParam           */
+#define TM2_OPTYPE_RESHAPE              23 /* TM2_ReshapeParam         */
+#define TM2_OPTYPE_ROIPOOLING           24 /* TM2_ROIPoolingParam      */
+#define TM2_OPTYPE_RPN                  25 /* TM2_RPNParam             */
+#define TM2_OPTYPE_SCALE                26 /* TM2_ScaleParam           */
+#define TM2_OPTYPE_SLICE                27 /* TM2_SliceParam           */
+#define TM2_OPTYPE_SOFTMAX              28 /* TM2_SoftmaxParam         */
+#define TM2_OPTYPE_SPLIT                29 /* No Param                 */
 #define TM2_OPTYPE_DETECTIONPOSTPROCESS 30 /* TM2_DetectionPostProcessParam */
-#define TM2_OPTYPE_GEMM 31 /* TM2_GemmParam            */
-#define TM2_OPTYPE_GENERIC 32 /* TM2_GenericParam         */
-#define TM2_OPTYPE_LOGISTIC 33 /* No Param                 */
-#define TM2_OPTYPE_LSTM 34 /* TM2_LstmParam            */
-#define TM2_OPTYPE_RNN 35 /* TM2_RnnParam             */
-#define TM2_OPTYPE_TANH 36 /* No Param                 */
-#define TM2_OPTYPE_SIGMOID 37 /* No Param                 */
-#define TM2_OPTYPE_SQUEEZE 38 /* TM2_SqueezeParam         */
-#define TM2_OPTYPE_FUSEDBNSCALERELU 39 /* No Param                 */
-#define TM2_OPTYPE_PAD 40 /* TM2_PadParam                 */
-#define TM2_OPTYPE_STRIDEDSLICE 41 /* TM2_StrideSliceParam                 */
-#define TM2_OPTYPE_ARGMAX 42 /* TM2_ArgmaxParam                 */
-#define TM2_OPTYPE_ARGMIN 43 /* TM2_ArgminParam                 */
-#define TM2_OPTYPE_TOPKV2 44 /* TM2_TopkV2Param                 */
-#define TM2_OPTYPE_REDUCTION 45 /* TM2_ReductionParam                   */
-#define TM2_OPTYPE_MAX 46 /* No Param                   */
-#define TM2_OPTYPE_MIN 47 /* No Param                   */
-#define TM2_OPTYPE_GRU 48 /* TM2_GruParam                 */
-#define TM2_OPTYPE_ADDN 49 /* TM2_AddNParam         */
-#define TM2_OPTYPE_SWAPAXIS 50 /* TM2_SwapAixsParam         */
-#define TM2_OPTYPE_UPSAMPLE 51 /* TM2_UpsampleParam        */
-#define TM2_OPTYPE_SPACETOBATCHND 52
-#define TM2_OPTYPE_BATCHTOSPACEND 53
-#define TM2_OPTYPE_RESIZE 54
-#define TM2_OPTYPE_SHUFFLECHANNEL 55 /* TM2_ShuffleChannelParam        */
-#define TM2_OPTYPE_CROP 56  /* TM2_CropParam */
-#define TM2_OPTYPE_ROIALIGN 57
-#define TM2_OPTYPE_PSROIPOOLING 58
-#define TM2_OPTYPE_UNARY 59
-#define TM2_OPTYPE_EXPANDDIMS 60
-#define TM2_OPTYPE_BIAS 61
-#define TM2_OPTYPE_NOOP 62
-#define TM2_OPTYPE_THRESHOLD 63
-#define TM2_OPTYPE_HARDSIGMOID 64
-#define TM2_OPTYPE_EMBED 65
-#define TM2_OPTYPE_INSTANCENORM 66
-#define TM2_OPTYPE_MVN 67
-#define TM2_OPTYPE_ABSVAL 68
-#define TM2_OPTYPE_CAST 69
-#define TM2_OPTYPE_HARDSWISH 70
-#define TM2_OPTYPE_INTERP 71
-#define TM2_OPTYPE_SELU 72
-#define TM2_OPTYPE_ELU 73
-#define TM2_OPTYPE_BROADMUL 74
-#define TM2_OPTYPE_LOGICAL 75
-#define TM2_OPTYPE_GATHER 76
-#define TM2_OPTYPE_TRANSPOSE 77
-#define TM2_OPTYPE_COMPARISON 78
-#define TM2_OPTYPE_SPACETODEPTH 79
-#define TM2_OPTYPE_DEPTHTOSPACE 80
-#define TM2_OPTYPE_REVERSE 81
-#define TM2_OPTYPE_SPARSETODENSE 82
-#define TM2_OPTYPE_CEIL 83
-#define TM2_OPTYPE_SQUAREDDIFFERENCE 84
-#define TM2_OPTYPE_ROUND 85
-#define TM2_OPTYPE_ZEROSLIKE 86
-#define TM2_OPTYPE_CLIP 87
-#define TM2_OPTYPE_UNSQUEEZE 88
-#define TM2_OPTYPE_REDUCEL2 89
-#define TM2_OPTYPE_MEAN 90
-#define TM2_OPTYPE_EXPAND 91
-#define TM2_OPTYPE_MATMUL 92
-#define TM2_OPTYPE_SCATTER 93
-#define TM2_OPTYPE_SHAPE 94
-#define TM2_OPTYPE_WHERE 95
-#define TM2_OPTYPE_TILE 96
-#define TM2_OPTYPE_MISH 97 /* No param*/
-#define TM2_OPTYPE_NUM 98
+#define TM2_OPTYPE_GEMM                 31 /* TM2_GemmParam            */
+#define TM2_OPTYPE_GENERIC              32 /* TM2_GenericParam         */
+#define TM2_OPTYPE_LOGISTIC             33 /* No Param                 */
+#define TM2_OPTYPE_LSTM                 34 /* TM2_LstmParam            */
+#define TM2_OPTYPE_RNN                  35 /* TM2_RnnParam             */
+#define TM2_OPTYPE_TANH                 36 /* No Param                 */
+#define TM2_OPTYPE_SIGMOID              37 /* No Param                 */
+#define TM2_OPTYPE_SQUEEZE              38 /* TM2_SqueezeParam         */
+#define TM2_OPTYPE_FUSEDBNSCALERELU     39 /* No Param                 */
+#define TM2_OPTYPE_PAD                  40 /* TM2_PadParam                 */
+#define TM2_OPTYPE_STRIDEDSLICE         41 /* TM2_StrideSliceParam                 */
+#define TM2_OPTYPE_ARGMAX               42 /* TM2_ArgmaxParam                 */
+#define TM2_OPTYPE_ARGMIN               43 /* TM2_ArgminParam                 */
+#define TM2_OPTYPE_TOPKV2               44 /* TM2_TopkV2Param                 */
+#define TM2_OPTYPE_REDUCTION            45 /* TM2_ReductionParam                   */
+#define TM2_OPTYPE_MAX                  46 /* No Param                   */
+#define TM2_OPTYPE_MIN                  47 /* No Param                   */
+#define TM2_OPTYPE_GRU                  48 /* TM2_GruParam                 */
+#define TM2_OPTYPE_ADDN                 49 /* TM2_AddNParam         */
+#define TM2_OPTYPE_SWAPAXIS             50 /* TM2_SwapAixsParam         */
+#define TM2_OPTYPE_UPSAMPLE             51 /* TM2_UpsampleParam        */
+#define TM2_OPTYPE_SPACETOBATCHND       52
+#define TM2_OPTYPE_BATCHTOSPACEND       53
+#define TM2_OPTYPE_RESIZE               54
+#define TM2_OPTYPE_SHUFFLECHANNEL       55 /* TM2_ShuffleChannelParam        */
+#define TM2_OPTYPE_CROP                 56 /* TM2_CropParam */
+#define TM2_OPTYPE_ROIALIGN             57
+#define TM2_OPTYPE_PSROIPOOLING         58
+#define TM2_OPTYPE_UNARY                59
+#define TM2_OPTYPE_EXPANDDIMS           60
+#define TM2_OPTYPE_BIAS                 61
+#define TM2_OPTYPE_NOOP                 62
+#define TM2_OPTYPE_THRESHOLD            63
+#define TM2_OPTYPE_HARDSIGMOID          64
+#define TM2_OPTYPE_EMBED                65
+#define TM2_OPTYPE_INSTANCENORM         66
+#define TM2_OPTYPE_MVN                  67
+#define TM2_OPTYPE_ABSVAL               68
+#define TM2_OPTYPE_CAST                 69
+#define TM2_OPTYPE_HARDSWISH            70
+#define TM2_OPTYPE_INTERP               71
+#define TM2_OPTYPE_SELU                 72
+#define TM2_OPTYPE_ELU                  73
+#define TM2_OPTYPE_BROADMUL             74
+#define TM2_OPTYPE_LOGICAL              75
+#define TM2_OPTYPE_GATHER               76
+#define TM2_OPTYPE_TRANSPOSE            77
+#define TM2_OPTYPE_COMPARISON           78
+#define TM2_OPTYPE_SPACETODEPTH         79
+#define TM2_OPTYPE_DEPTHTOSPACE         80
+#define TM2_OPTYPE_REVERSE              81
+#define TM2_OPTYPE_SPARSETODENSE        82
+#define TM2_OPTYPE_CEIL                 83
+#define TM2_OPTYPE_SQUAREDDIFFERENCE    84
+#define TM2_OPTYPE_ROUND                85
+#define TM2_OPTYPE_ZEROSLIKE            86
+#define TM2_OPTYPE_CLIP                 87
+#define TM2_OPTYPE_UNSQUEEZE            88
+#define TM2_OPTYPE_REDUCEL2             89
+#define TM2_OPTYPE_MEAN                 90
+#define TM2_OPTYPE_EXPAND               91
+#define TM2_OPTYPE_MATMUL               92
+#define TM2_OPTYPE_SCATTER              93
+#define TM2_OPTYPE_SHAPE                94
+#define TM2_OPTYPE_WHERE                95
+#define TM2_OPTYPE_TILE                 96
+#define TM2_OPTYPE_MISH                 97 /* No param*/
+#define TM2_OPTYPE_NUM                  98
 
 /* --------------------- -------- TM objects -------------------------------- */
 
 typedef struct
 {
-    uint16_t ver_main; /* main version of Tengine model file format */
-    uint16_t ver_sub; /* sub version of Tengine model file format */
-    uint16_t ver_compile; /* compile version of Tengine model file format */
+    uint16_t ver_main;        /* main version of Tengine model file format */
+    uint16_t ver_sub;         /* sub version of Tengine model file format */
+    uint16_t ver_compile;     /* compile version of Tengine model file format */
     tm_uoffset_t offset_root; /* offset of root table (TM2_Model) */
 } TM2_Header;
 
 /* Root table of Tengine model */
 typedef struct
 {
-    int32_t orig_format; /* format of original model */
-    int32_t sub_format; /* sub format for DLA model */
+    int32_t orig_format;              /* format of original model */
+    int32_t sub_format;               /* sub format for DLA model */
     tm_uoffset_t offset_vo_subgraphs; /* offset of TM2_Vector_offsets <offsets of subgraphs> */
-    tm_uoffset_t offset_s_mname; /* offset of string <model name> */
+    tm_uoffset_t offset_s_mname;      /* offset of string <model name> */
 } TM2_Model;
 
 /* Only 1 subgraph is supported currently */
 typedef struct
 {
-    uint32_t subgraph_id; /* subgraph id */
-    int32_t graph_layout; /* actual data layout */
-    int32_t model_layout; /* data layout of original model */
-    tm_uoffset_t offset_vi_input_indices; /* offset of TM2_Vector_indices <indices of input nodes> */
+    uint32_t subgraph_id;                  /* subgraph id */
+    int32_t graph_layout;                  /* actual data layout */
+    int32_t model_layout;                  /* data layout of original model */
+    tm_uoffset_t offset_vi_input_indices;  /* offset of TM2_Vector_indices <indices of input nodes> */
     tm_uoffset_t offset_vi_output_indices; /* offset of TM2_Vector_indices <indices of output nodes> */
-    tm_uoffset_t offset_vo_seq_nodes; /* offset of TM2_Vector_offsets <nodes> */
-    tm_uoffset_t offset_vo_tensors; /* offset of TM2_Vector_offsets <tensors> */
-    tm_uoffset_t offset_vo_buffers; /* offset of TM2_Vector_offsets <buffers> */
-    tm_uoffset_t offset_s_sname; /* offset of string <subgraph name> */
-    tm_uoffset_t offset_vo_sub_info; /* offset of TM2_Vector_offsets <sub graph infomation> */
+    tm_uoffset_t offset_vo_seq_nodes;      /* offset of TM2_Vector_offsets <nodes> */
+    tm_uoffset_t offset_vo_tensors;        /* offset of TM2_Vector_offsets <tensors> */
+    tm_uoffset_t offset_vo_buffers;        /* offset of TM2_Vector_offsets <buffers> */
+    tm_uoffset_t offset_s_sname;           /* offset of string <subgraph name> */
+    tm_uoffset_t offset_vo_sub_info;       /* offset of TM2_Vector_offsets <sub graph infomation> */
 } TM2_Subgraph;
 
 typedef struct
 {
-    uint32_t subgraph_id; /* sub graph idx */
-    uint32_t input_wait_count; /* input wait count */
-    int32_t data_type;         /* FP32 FP16 U8 INT8 */
+    uint32_t subgraph_id;                 /* sub graph idx */
+    uint32_t input_wait_count;            /* input wait count */
+    int32_t data_type;                    /* FP32 FP16 U8 INT8 */
     tm_uoffset_t offset_vi_node_list;     /* offset of TM2_Vector_indices <indices of node list> */
     tm_uoffset_t offset_vi_input_tensor;  /* offset of TM2_Vector_indices <indices of input node> */
     tm_uoffset_t offset_vi_output_tensor; /* offset of TM2_Vector_indices <indices of output node> */
@@ -292,25 +292,25 @@ typedef struct
 typedef struct
 {
     tm_uoffset_t offset_s_attrname; /* offset of string <attr name> */
-    tm_uoffset_t offset_s_attrval; /* offset of string <attr value> */
+    tm_uoffset_t offset_s_attrval;  /* offset of string <attr value> */
     int32_t attr_type;
 } TM2_Attr;
 
 typedef struct
 {
-    uint32_t node_id; /* node id */
-    tm_uoffset_t offset_vi_input_tensors; /* offset of TM2_Vector_indices <indices of input tensors> */
+    uint32_t node_id;                      /* node id */
+    tm_uoffset_t offset_vi_input_tensors;  /* offset of TM2_Vector_indices <indices of input tensors> */
     tm_uoffset_t offset_vi_output_tensors; /* offset of TM2_Vector_indices <indices of output tensors> */
-    tm_uoffset_t offset_t_operator; /* offset of table  <operator> */
-    tm_uoffset_t offset_s_nname; /* offset of string <node name> */
-    tm_uoffset_t offset_vo_attrs; /* offset of TM2_Vector_offsets <attrs> */
+    tm_uoffset_t offset_t_operator;        /* offset of table  <operator> */
+    tm_uoffset_t offset_s_nname;           /* offset of string <node name> */
+    tm_uoffset_t offset_vo_attrs;          /* offset of TM2_Vector_offsets <attrs> */
     tm_bool_t dynamic_shape;
 } TM2_Node;
 
 typedef struct
 {
-    uint32_t op_ver; /* version of operator */
-    uint32_t operator_type; /* operator type */
+    uint32_t op_ver;             /* version of operator */
+    uint32_t operator_type;      /* operator type */
     tm_uoffset_t offset_t_param; /* offset of table <operator param> */
 } TM2_Operator;
 
@@ -325,8 +325,8 @@ typedef struct
 {
     uint32_t tensor_id;
     uint32_t buffer_id;
-    tm_uoffset_t offset_vd_dims; /* offset of TM2_Vector_dims <dims> */
-    tm_uoffset_t offset_s_tname; /* offset of string <tensor name> */
+    tm_uoffset_t offset_vd_dims;        /* offset of TM2_Vector_dims <dims> */
+    tm_uoffset_t offset_s_tname;        /* offset of string <tensor name> */
     tm_uoffset_t offect_vo_quantparams; /* offset of TM2_Vector_offsets <quant params> */
     int32_t layout;
     int32_t type;
@@ -335,13 +335,13 @@ typedef struct
 
 typedef struct
 {
-    tm_size_t size; /* buffer size */
+    tm_size_t size;           /* buffer size */
     tm_uoffset_t offset_data; /* offset of buffer data */
 } TM2_Buffer;
 
 typedef struct
 {
-    tm_size_t size; /* string size */
+    tm_size_t size;           /* string size */
     tm_uoffset_t offset_data; /* offset of string data */
 } TM2_String;
 
@@ -373,7 +373,7 @@ typedef struct
 
 typedef struct
 {
-    tm_size_t v_num; /* number of vector elements */
+    tm_size_t v_num;  /* number of vector elements */
     float data[0][4]; /* x0, y0, x1, y1 */
 } TM2_Vector_anchors;
 
@@ -504,9 +504,9 @@ typedef struct
 
 typedef struct
 {
-    tm_uoffset_t offset_vf_min_size; /* offset of TM2_Vector_floats <min_sizes> */
-    tm_uoffset_t offset_vf_max_size; /* offset of TM2_Vector_floats <max_sizes> */
-    tm_uoffset_t offset_vf_variance; /* offset of TM2_Vector_floats <variances> */
+    tm_uoffset_t offset_vf_min_size;     /* offset of TM2_Vector_floats <min_sizes> */
+    tm_uoffset_t offset_vf_max_size;     /* offset of TM2_Vector_floats <max_sizes> */
+    tm_uoffset_t offset_vf_variance;     /* offset of TM2_Vector_floats <variances> */
     tm_uoffset_t offset_vf_aspect_ratio; /* offset of TM2_Vector_floats <aspect_ratios> */
     int32_t flip;
     int32_t clip;
@@ -564,7 +564,7 @@ typedef struct
 
 typedef struct
 {
-    tm_uoffset_t offset_vf_ratios; /* pointer to TM2_Vector_floats <ratios> */
+    tm_uoffset_t offset_vf_ratios;        /* pointer to TM2_Vector_floats <ratios> */
     tm_uoffset_t offset_vf_anchor_scales; /* pointer to TM2_Vector_floats <anchor_scales> */
     int32_t feat_stride;
     int32_t basesize;
@@ -586,8 +586,8 @@ typedef struct
 {
     int32_t axis;
     tm_uoffset_t offset_vi_slice_points; /* offset of TM2_Vector_dims <slice_points> */
-    tm_uoffset_t offset_vi_begins; /* offset of TM2_Vector_dims <begins> */
-    tm_uoffset_t offset_vi_sizes; /* offset of TM2_Vector_dims <sizes> */
+    tm_uoffset_t offset_vi_begins;       /* offset of TM2_Vector_dims <begins> */
+    tm_uoffset_t offset_vi_sizes;        /* offset of TM2_Vector_dims <sizes> */
     int32_t iscaffe;
     int32_t ismxnet;
     int32_t isonnx;
@@ -766,42 +766,40 @@ typedef struct
 
 typedef struct
 {
-   int32_t dilation_x;
-   int32_t dilation_y;
-   int32_t pad_top;
-   int32_t pad_bottom;
-   int32_t pad_left;
-   int32_t pad_right;
+    int32_t dilation_x;
+    int32_t dilation_y;
+    int32_t pad_top;
+    int32_t pad_bottom;
+    int32_t pad_left;
+    int32_t pad_right;
 
 } TM2_SpaceToBatchNDParam;
 
-
 typedef struct
 {
-   int32_t dilation_x;
-   int32_t dilation_y;
-   int32_t crop_top;
-   int32_t crop_bottom;
-   int32_t crop_left;
-   int32_t crop_right;
+    int32_t dilation_x;
+    int32_t dilation_y;
+    int32_t crop_top;
+    int32_t crop_bottom;
+    int32_t crop_left;
+    int32_t crop_right;
 
 } TM2_BatchToSpaceNDParam;
 
 typedef struct
 {
-   int32_t num_args;
-   int32_t offset_c;
-   int32_t offset_h;
-   int32_t offset_w;
-   int32_t crop_h;
-   int32_t crop_w;
-   bool center_crop;
-   int32_t axis;
-   int32_t flag;
+    int32_t num_args;
+    int32_t offset_c;
+    int32_t offset_h;
+    int32_t offset_w;
+    int32_t crop_h;
+    int32_t crop_w;
+    bool center_crop;
+    int32_t axis;
+    int32_t flag;
 } TM2_CropParam;
 
-
-typedef struct 
+typedef struct
 {
     int32_t pooled_width;
     int32_t pooled_height;
@@ -826,34 +824,33 @@ typedef struct
     int32_t type;
 } TM2_UnaryParam;
 
-
 typedef struct
 {
-   int32_t bias_size;
+    int32_t bias_size;
 } TM2_BiasParam;
 
 typedef struct
 {
-   float threshold;
+    float threshold;
 } TM2_ThresholdParam;
 
 typedef struct
 {
-   float alpha;
-   float beta;
+    float alpha;
+    float beta;
 } TM2_HardsigmoidParam;
 
 typedef struct
 {
-   int32_t num_output;
-   int32_t input_dim;
-   int32_t bias_term;
-   int32_t weight_data_size;
+    int32_t num_output;
+    int32_t input_dim;
+    int32_t bias_term;
+    int32_t weight_data_size;
 } TM2_EmbedParam;
 
 typedef struct
 {
-   float eps;
+    float eps;
 } TM2_InstanceNormParam;
 
 typedef struct
@@ -863,33 +860,37 @@ typedef struct
     float eps;
 } TM2_MVNParam;
 
-
-typedef struct{
+typedef struct
+{
     int32_t type_from;
     int32_t type_to;
-}TM2_CastParam;
+} TM2_CastParam;
 
-typedef struct{
+typedef struct
+{
     float alpha;
     float beta;
-}TM2_HardSwishParam;
+} TM2_HardSwishParam;
 
-typedef struct{
-    int32_t resize_type;//1=nearest  2=bilinear  3=bicubic
+typedef struct
+{
+    int32_t resize_type; //1=nearest  2=bilinear  3=bicubic
     float width_scale;
     float height_scale;
     int32_t output_width;
     int32_t output_height;
-}TM2_InterpParam;
+} TM2_InterpParam;
 
-typedef struct{
+typedef struct
+{
     float alpha;
     float gamma;
-}TM2_SeluParam;
+} TM2_SeluParam;
 
-typedef struct{
+typedef struct
+{
     float alpha;
-}TM2_EluParam;
+} TM2_EluParam;
 
 typedef struct
 {
@@ -902,21 +903,22 @@ typedef struct
     int32_t indices_num;
     tm_bool_t is_onnx;
 } TM2_GatherParam;
-typedef struct{
+typedef struct
+{
     tm_uoffset_t offset_tr_shape;
-}TM2_TransposeParam;
+} TM2_TransposeParam;
 typedef struct
 {
     int32_t type;
 } TM2_ComparisonParam;
 typedef struct
 {
-   int block_size;
+    int block_size;
 } TM2_SpaceToDepthParam;
 
 typedef struct
 {
-   int block_size;
+    int block_size;
 } TM2_DepthToSpaceParam;
 
 typedef struct
@@ -934,31 +936,31 @@ typedef struct
 
 typedef struct
 {
-     tm_uoffset_t offset_vi_axises;
-}TM2_UnsqueezeParam;
+    tm_uoffset_t offset_vi_axises;
+} TM2_UnsqueezeParam;
 
 typedef struct
 {
     int axis;
     int keepdim;
-}TM2_ReduceL2Param;
+} TM2_ReduceL2Param;
 
 typedef struct
 {
-     tm_uoffset_t offset_v_shape;
-}TM2_ExpandParam;
+    tm_uoffset_t offset_v_shape;
+} TM2_ExpandParam;
 
 typedef struct
 {
-     int axis;
-     tm_bool_t is_onnx;
-}TM2_ScatterParam;
+    int axis;
+    tm_bool_t is_onnx;
+} TM2_ScatterParam;
 
 typedef struct
 {
-     tm_uoffset_t offset_vi_flag;   // caffe: 0, onnx: 1
-     tm_uoffset_t offset_vi_reps;
-}TM2_TileParam;
+    tm_uoffset_t offset_vi_flag; // caffe: 0, onnx: 1
+    tm_uoffset_t offset_vi_reps;
+} TM2_TileParam;
 
 #ifdef __cplusplus
 }
diff --git a/tools/quantize/savegraph/tm2_generate.c b/tools/quantize/savegraph/tm2_generate.c
index 71db31f8b..4ba97d177 100644
--- a/tools/quantize/savegraph/tm2_generate.c
+++ b/tools/quantize/savegraph/tm2_generate.c
@@ -28,7 +28,7 @@
 extern "C" {
 #endif
 
-#define ALIGN(pos, alignbytes) (((pos) + ( alignbytes )-1) & ~(( alignbytes )-1))
+#define ALIGN(pos, alignbytes) (((pos) + (alignbytes)-1) & ~((alignbytes)-1))
 
 uint32_t WriteTmFileAlign1(void* const start_ptr, uint32_t* cur_pos, const void* buf, const uint32_t buf_size)
 {
diff --git a/tools/quantize/savegraph/tm2_op_save.cpp b/tools/quantize/savegraph/tm2_op_save.cpp
index 29e052471..a964f84b4 100644
--- a/tools/quantize/savegraph/tm2_op_save.cpp
+++ b/tools/quantize/savegraph/tm2_op_save.cpp
@@ -31,7 +31,6 @@
 #include "graph/tensor.h"
 #include "utility/log.h"
 
-
 inline void SetTmOperator(TM2_Operator* tm_op, const uint32_t op_type, const tm_uoffset_t offset)
 {
     tm_op->op_ver = TM2_OP_VER;
@@ -275,9 +274,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru
     TM2_PriorBoxParam tm_param;
 
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->min_size_num;
-    TM2_Vector_floats* v_minsizes = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_minsizes = (TM2_Vector_floats*)malloc(vector_size);
     v_minsizes->v_num = p->min_size_num;
-    for(unsigned int i = 0; i < p->min_size_num; i++)
+    for (unsigned int i = 0; i < p->min_size_num; i++)
     {
         v_minsizes->data[i] = p->min_size[i];
     }
@@ -285,9 +284,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru
     free(v_minsizes);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->max_size_num;
-    TM2_Vector_floats* v_maxsizes = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_maxsizes = (TM2_Vector_floats*)malloc(vector_size);
     v_maxsizes->v_num = p->max_size_num;
-    for(unsigned int i = 0; i < p->max_size_num; i++)
+    for (unsigned int i = 0; i < p->max_size_num; i++)
     {
         v_maxsizes->data[i] = p->max_size[i];
     }
@@ -296,9 +295,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru
 
     int variance_num = 4; // tengine lite does not set the variable.
     vector_size = sizeof(tm_size_t) + sizeof(float) * variance_num;
-    TM2_Vector_floats* v_variance = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_variance = (TM2_Vector_floats*)malloc(vector_size);
     v_variance->v_num = variance_num;
-    for(unsigned int i = 0; i < variance_num; i++)
+    for (unsigned int i = 0; i < variance_num; i++)
     {
         v_variance->data[i] = p->variance[i];
     }
@@ -306,9 +305,9 @@ tm_uoffset_t SaveTmPriorBoxOp(void* const start_ptr, tm_uoffset_t* cur_pos, stru
     free(v_variance);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->aspect_ratio_size;
-    TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size);
     v_ratios->v_num = p->aspect_ratio_size;
-    for(unsigned int i = 0; i < p->aspect_ratio_size; i++)
+    for (unsigned int i = 0; i < p->aspect_ratio_size; i++)
     {
         v_ratios->data[i] = p->aspect_ratio[i];
     }
@@ -343,9 +342,9 @@ tm_uoffset_t SaveTmRegionOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct
     tm_param.nms_threshold = p->nms_threshold;
 
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->biases_num;
-    TM2_Vector_floats* v_biases = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_biases = (TM2_Vector_floats*)malloc(vector_size);
     v_biases->v_num = p->biases_num;
-    for(unsigned int i = 0; i < p->biases_num; i++)
+    for (unsigned int i = 0; i < p->biases_num; i++)
     {
         v_biases->data[i] = p->biases[i];
     }
@@ -390,36 +389,35 @@ tm_uoffset_t SaveTmReshapeOp(void* const start_ptr, tm_uoffset_t* cur_pos, struc
 {
     struct reshape_param* p = (struct reshape_param*)node->op.param_mem;
     TM2_ReshapeParam tm_param;
-    if(p->reverse)
+    if (p->reverse)
         tm_param.reverse = 1;
     else
         tm_param.reverse = 0;
-    if(p->is_mxnet)
+    if (p->is_mxnet)
         tm_param.is_mxnet = 1;
     else
         tm_param.is_mxnet = 0;
 
-    if(p->dim_size)
+    if (p->dim_size)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->dim_size;
-        TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size);
         v_re_shape->v_num = p->dim_size;
-        for(unsigned int i = 0; i < p->dim_size; i++)
+        for (unsigned int i = 0; i < p->dim_size; i++)
         {
             v_re_shape->dims[i] = p->re_shape[i];
         }
         tm_param.offset_re_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size);
         free(v_re_shape);
     }
-    else{
+    else
+    {
         tm_param.offset_re_shape = TM2_NOT_SET;
     }
 
-
     TM2_Operator tm_op;
     SetTmOperator(&tm_op, TM2_OPTYPE_RESHAPE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ReshapeParam)));
     return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
-
 }
 
 tm_uoffset_t SaveTmResizeOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct node* node)
@@ -456,9 +454,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct no
     TM2_RPNParam tm_param;
 
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * p->ratios->elem_num;
-    TM2_Vector_floats* v_ratios = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_ratios = (TM2_Vector_floats*)malloc(vector_size);
     v_ratios->v_num = p->ratios->elem_num;
-    for(unsigned int i = 0; i < p->ratios->elem_num; i++)
+    for (unsigned int i = 0; i < p->ratios->elem_num; i++)
     {
         v_ratios->data[i] = *(float*)get_vector_data(p->ratios, i);
     }
@@ -466,9 +464,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct no
     free(v_ratios);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchor_scales->elem_num;
-    TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size);
     v_scales->v_num = p->anchor_scales->elem_num;
-    for(unsigned int i = 0; i < p->anchor_scales->elem_num; i++)
+    for (unsigned int i = 0; i < p->anchor_scales->elem_num; i++)
     {
         v_scales->data[i] = *(float*)get_vector_data(p->anchor_scales, i);
     }
@@ -476,9 +474,9 @@ tm_uoffset_t SaveTmRPNOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct no
     free(v_scales);
 
     vector_size = sizeof(tm_size_t) + sizeof(float) * p->anchors_->elem_num * 4;
-    TM2_Vector_anchors* v_anchors = ( TM2_Vector_anchors* )malloc(vector_size);
+    TM2_Vector_anchors* v_anchors = (TM2_Vector_anchors*)malloc(vector_size);
     v_anchors->v_num = p->anchors_->elem_num;
-    for(unsigned int i = 0; i < p->anchors_->elem_num; i++)
+    for (unsigned int i = 0; i < p->anchors_->elem_num; i++)
     {
         v_anchors->data[i][0] = ((Anchor_t*)get_vector_data(p->anchors_, i))->x0;
         v_anchors->data[i][1] = ((Anchor_t*)get_vector_data(p->anchors_, i))->y0;
@@ -522,16 +520,17 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct
     tm_param.iscaffe = p->iscaffe;
     tm_param.isonnx = p->isonnx;
     tm_param.ismxnet = p->ismxnet;
-    if(!tm_param.iscaffe){
+    if (!tm_param.iscaffe)
+    {
         tm_param.begin = p->begin;
         tm_param.end = p->end;
     }
-    if(p->slice_point_->elem_num)
+    if (p->slice_point_->elem_num)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->slice_point_->elem_num;
-        TM2_Vector_dims* v_slice_points = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_slice_points = (TM2_Vector_dims*)malloc(vector_size);
         v_slice_points->v_num = p->slice_point_->elem_num;
-        for(unsigned int i = 0; i < p->slice_point_->elem_num; i++)
+        for (unsigned int i = 0; i < p->slice_point_->elem_num; i++)
         {
             v_slice_points->dims[i] = *(int32_t*)get_vector_data(p->slice_point_, i);
         }
@@ -541,12 +540,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct
     else
         tm_param.offset_vi_slice_points = TM2_NOT_SET;
 
-    if(p->begin_->elem_num)
+    if (p->begin_->elem_num)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->begin_->elem_num;
-        TM2_Vector_dims* v_begins = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_begins = (TM2_Vector_dims*)malloc(vector_size);
         v_begins->v_num = p->begin_->elem_num;
-        for(unsigned int i = 0; i < p->begin_->elem_num; i++)
+        for (unsigned int i = 0; i < p->begin_->elem_num; i++)
         {
             v_begins->dims[i] = *(int32_t*)get_vector_data(p->begin_, i);
         }
@@ -556,12 +555,12 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct
     else
         tm_param.offset_vi_begins = TM2_NOT_SET;
 
-    if(p->size_->elem_num)
+    if (p->size_->elem_num)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->size_->elem_num;
-        TM2_Vector_dims* v_sizes = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_sizes = (TM2_Vector_dims*)malloc(vector_size);
         v_sizes->v_num = p->size_->elem_num;
-        for(unsigned int i = 0; i < p->size_->elem_num; i++)
+        for (unsigned int i = 0; i < p->size_->elem_num; i++)
         {
             v_sizes->dims[i] = *(int32_t*)get_vector_data(p->size_, i);
         }
@@ -571,7 +570,6 @@ tm_uoffset_t SaveTmSliceOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct
     else
         tm_param.offset_vi_sizes = TM2_NOT_SET;
 
-
     TM2_Operator tm_op;
     SetTmOperator(&tm_op, TM2_OPTYPE_SLICE, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SliceParam)));
     return WriteTmObject(start_ptr, cur_pos, &tm_op, sizeof(TM2_Operator));
@@ -592,24 +590,27 @@ tm_uoffset_t SaveTmSplitOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct
 {
     struct split_param* p = (struct split_param*)node->op.param_mem;
     TM2_SplitParam tm_param;
-    if(p->is_caffe)
+    if (p->is_caffe)
         tm_param.is_caffe = 1;
     else
         tm_param.is_caffe = 0;
 
-    if(p->is_onnx){
+    if (p->is_onnx)
+    {
         tm_param.is_onnx = 1;
-    } else {
+    }
+    else
+    {
         tm_param.is_onnx = 0;
     }
-    if(!p->is_caffe)
+    if (!p->is_caffe)
     {
-        if(p->is_onnx)
+        if (p->is_onnx)
             tm_param.axis = p->axis;
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->split_sizes_->elem_num;
-        TM2_Vector_dims* v_split_sizes = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_split_sizes = (TM2_Vector_dims*)malloc(vector_size);
         v_split_sizes->v_num = p->split_sizes_->elem_num;
-        for(unsigned int i = 0; i < p->split_sizes_->elem_num; i++)
+        for (unsigned int i = 0; i < p->split_sizes_->elem_num; i++)
         {
             v_split_sizes->dims[i] = *(int32_t*)get_vector_data(p->split_sizes_, i);
         }
@@ -636,9 +637,9 @@ tm_uoffset_t SaveTmDetectionPostProcessOp(void* const start_ptr, tm_uoffset_t* c
 
     int param_scales_num = 4;
     size_t vector_size = sizeof(tm_size_t) + sizeof(float) * param_scales_num;
-    TM2_Vector_floats* v_scales = ( TM2_Vector_floats* )malloc(vector_size);
+    TM2_Vector_floats* v_scales = (TM2_Vector_floats*)malloc(vector_size);
     v_scales->v_num = param_scales_num;
-    for(unsigned int i = 0; i < param_scales_num; i++)
+    for (unsigned int i = 0; i < param_scales_num; i++)
     {
         v_scales->data[i] = p->scales[i];
     }
@@ -780,7 +781,7 @@ tm_uoffset_t SaveTmTopKV2Op(void* const start_ptr, tm_uoffset_t* cur_pos, struct
     TM2_TopKV2Param tm_param;
 
     tm_param.k = p->k;
-    if(p->sorted)
+    if (p->sorted)
         tm_param.sorted = 1;
     else
         tm_param.sorted = 0;
@@ -992,7 +993,7 @@ tm_uoffset_t SaveTmExpanddimsOp(void* const start_ptr, tm_uoffset_t* cur_pos, st
     struct expanddims_param* p = (struct expanddims_param*)node->op.param_mem;
     TM2_ExpanddimsParam tm_param;
 
-    tm_param.axis= p->axis;
+    tm_param.axis = p->axis;
 
     TM2_Operator tm_op;
     SetTmOperator(&tm_op, TM2_OPTYPE_EXPANDDIMS, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_ExpanddimsParam)));
@@ -1116,7 +1117,7 @@ tm_uoffset_t SaveTmSeluOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct n
     struct selu_param* p = (struct selu_param*)node->op.param_mem;
     TM2_SeluParam tm_param;
     tm_param.alpha = p->alpha;
-    tm_param.gamma = p->lambda;//gamma
+    tm_param.gamma = p->lambda; //gamma
 
     TM2_Operator tm_op;
     SetTmOperator(&tm_op, TM2_OPTYPE_SELU, WriteTmObject(start_ptr, cur_pos, &tm_param, sizeof(TM2_SeluParam)));
@@ -1169,19 +1170,20 @@ tm_uoffset_t SaveTmTransposeOp(void* const start_ptr, tm_uoffset_t* cur_pos, str
 {
     struct transpose_param* p = (struct transpose_param*)node->op.param_mem;
     TM2_TransposeParam tm_param;
-    if(p->tr_shape_size)
+    if (p->tr_shape_size)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->tr_shape_size;
-        TM2_Vector_dims* v_re_shape = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_re_shape = (TM2_Vector_dims*)malloc(vector_size);
         v_re_shape->v_num = p->tr_shape_size;
-        for(unsigned int i = 0; i < p->tr_shape_size; i++)
+        for (unsigned int i = 0; i < p->tr_shape_size; i++)
         {
             v_re_shape->dims[i] = p->tr_shape[i];
         }
         tm_param.offset_tr_shape = WriteTmObject(start_ptr, cur_pos, v_re_shape, vector_size);
         free(v_re_shape);
     }
-    else{
+    else
+    {
         tm_param.offset_tr_shape = TM2_NOT_SET;
     }
     TM2_Operator tm_op;
@@ -1281,12 +1283,12 @@ tm_uoffset_t SaveTmUnsqueezeOp(void* const start_ptr, tm_uoffset_t* cur_pos, str
     struct unsqueeze_param* p = (struct unsqueeze_param*)node->op.param_mem;
     TM2_UnsqueezeParam tm_param;
 
-    if(p->axises_size)
+    if (p->axises_size)
     {
         size_t vector_size = sizeof(tm_size_t) + sizeof(int32_t) * p->axises_size;
-        TM2_Vector_dims* v_axises = ( TM2_Vector_dims* )malloc(vector_size);
+        TM2_Vector_dims* v_axises = (TM2_Vector_dims*)malloc(vector_size);
         v_axises->v_num = p->axises_size;
-        for(unsigned int i = 0; i < p->axises_size; i++)
+        for (unsigned int i = 0; i < p->axises_size; i++)
         {
             v_axises->dims[i] = p->axises[i];
         }
@@ -1330,178 +1332,178 @@ tm_uoffset_t SaveTmMatMulOp(void* const start_ptr, tm_uoffset_t* cur_pos, struct
 
 op_save_t SaveTmOpFunc(uint32_t op_type)
 {
-    switch(op_type)
+    switch (op_type)
     {
-        case OP_BATCHNORM:
-            return SaveTmBatchNormOp;
-        case OP_CONCAT:
-            return SaveTmConcatOp;
-        case OP_CONST:
-            return SaveTmConstOp;
-        case OP_CONV:
-            return SaveTmConvOp;
-        case OP_DECONV:
-            return SaveTmDeconvOp;
-        case OP_DETECTION_OUTPUT:
-            return SaveTmDetectionOutputOp;
-        case OP_DROPOUT:
-            return SaveTmDropoutOp;
-        case OP_ELTWISE:
-            return SaveTmEltwiseOp;
-        case OP_FLATTEN:
-            return SaveTmFlattenOp;
-        case OP_FC:
-            return SaveTmFCOp;
-        case OP_INPUT:
-            return SaveTmInputOp;
-        case OP_LRN:
-            return SaveTmLRNOp;
-        case OP_NORMALIZE:
-            return SaveTmNormalizeOp;
-        case OP_PERMUTE:
-            return SaveTmPermuteOp;
-        case OP_POOL:
-            return SaveTmPoolingOp;
-        case OP_PRELU:
-            return SaveTmPreluOp;
-        case OP_PRIORBOX:
-            return SaveTmPriorBoxOp;
-        case OP_REGION:
-            return SaveTmRegionOp;
-        case OP_RELU:
-            return SaveTmReLuOp;
-        case OP_RELU6:
-            return SaveTmRelu6Op;
-        case OP_REORG:
-            return SaveTmReorgOp;
-        case OP_RESHAPE:
-            return SaveTmReshapeOp;
-        case OP_ROIPOOLING:
-            return SaveTmROIPoolingOp;
-        case OP_RPN:
-            return SaveTmRPNOp;
-        case OP_SCALE:
-            return SaveTmScaleOp;
-        case OP_SLICE:
-            return SaveTmSliceOp;
-        case OP_SOFTMAX:
-            return SaveTmSoftmaxOp;
-        case OP_SPLIT:
-            return SaveTmSplitOp;
-        case OP_DETECTION_POSTPROCESS:
-            return SaveTmDetectionPostProcessOp;
-        case OP_GEMM:
-            return SaveTmGemmOp;
-        case OP_LOGISTIC:
-            return SaveTmLogisticOp;
-        case OP_LSTM:
-            return SaveTmLstmOp;
-        case OP_RNN:
-            return SaveTmRnnOp;
-        case OP_TANH:
-            return SaveTmTanhOp;
-        case OP_SIGMOID:
-            return SaveTmSigmoidOp;
-        case OP_SQUEEZE:
-            return SaveTmSqueezeOp;
-        case OP_SWAP_AXIS:
-            return SaveTmSwapAxisOp;
-        case OP_GRU:
-            return SaveTmGruOp;
-        case OP_ARGMAX:
-            return SaveTmArgMaxOp;
-        case OP_ARGMIN:
-            return SaveTmArgMinOp;
-        case OP_TOPKV2:
-            return SaveTmTopKV2Op;
-        case OP_PAD:
-            return SaveTmPadOp;
-        case OP_STRIDED_SLICE:
-            return SaveTmStridedSliceOp;
-        case OP_REDUCTION:
-            return SaveTmReductionOp;
-        case OP_UPSAMPLE:
-            return SaveTmUpsampleOp;
-        case OP_SHUFFLECHANNEL:
-            return SaveTmShuffleChannelOp;
-        case OP_SPACETOBATCHND:
-            return SaveTmSpaceToBatchNDOp;
-        case OP_BATCHTOSPACEND:
-            return SaveTmBatchToSpaceNDOp;
-        case OP_RESIZE:
-            return SaveTmResizeOp;
-        case OP_CROP:
-            return SaveTmCropOp;
-        case OP_ROIALIGN:
-            return SaveTmRoialignOp;
-        case OP_PSROIPOOLING:
-            return SaveTmPsroipoolingOp;
-        case OP_EXPANDDIMS:
-            return SaveTmExpanddimsOp;
-        case OP_UNARY:
-            return SaveTmUnaryOp;
-        case OP_NOOP:
-            return SaveTmNoopOp;
-        case OP_THRESHOLD:
-            return SaveTmThresholdOp;
-        case OP_HARDSIGMOID:
-            return SaveTmHardsigmoidOp;
-        case OP_EMBEDDING:
-            return SaveTmEmbedOp;
-        case OP_INSTANCENORM:
-            return SaveTmInstanceNormOp;
-        case OP_MVN:
-            return SaveTmMVNOp;
-        case OP_CAST:
-            return SaveTmCastOp;
-        case OP_HARDSWISH:
-            return SaveTmHardSwishOp;
-        case OP_INTERP:
-            return SaveTmInterpOp;
-        case OP_SELU:
-            return SaveTmSeluOp;
-        case OP_ELU:
-            return SaveTmEluOp;
-        case OP_BROADMUL:
-            return SaveTmBroadMulOp;
-        case OP_LOGICAL:
-            return SaveTmLogicalOp;
-        case OP_GATHER:
-            return SaveTmGatherOp;
-        case OP_TRANSPOSE:
-            return SaveTmTransposeOp;
-        case OP_COMPARISON:
-            return SaveTmComparisonOp;
-        case OP_REVERSE:
-            return SaveTmReverseOp;
-        case OP_SPACETODEPTH:
-            return SaveTmSpaceToDepthOp;
-        case OP_DEPTHTOSPACE:
-            return SaveTmDepthToSpaceOp;
-        case OP_SQUAREDDIFFERENCE:
-            return SaveTmSquaredDifferenceOp;
-        case OP_SPARSETODENSE:
-            return SaveTmSparseToDenseOp;
-        case OP_CEIL:
-            return SaveTmCeilOp;
-        case OP_ROUND:
-            return SaveTmRoundOp;
-        case OP_ZEROSLIKE:
-            return SaveTmZerosLikeOp;
-        case OP_CLIP:
-            return SaveTmClipOp;
-        case OP_REDUCEL2:
-            return SaveTmReduceL2Op;
-        case OP_UNSQUEEZE:
-            return SaveTmUnsqueezeOp;
-        case OP_MEAN:
-            return SaveTmMeanOp;
-        case OP_MATMUL:
-            return SaveTmMatMulOp;
-        case OP_MISH:
-            return SaveTmMishOp;
-        default:
-            // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type);
-            return nullptr;
+    case OP_BATCHNORM:
+        return SaveTmBatchNormOp;
+    case OP_CONCAT:
+        return SaveTmConcatOp;
+    case OP_CONST:
+        return SaveTmConstOp;
+    case OP_CONV:
+        return SaveTmConvOp;
+    case OP_DECONV:
+        return SaveTmDeconvOp;
+    case OP_DETECTION_OUTPUT:
+        return SaveTmDetectionOutputOp;
+    case OP_DROPOUT:
+        return SaveTmDropoutOp;
+    case OP_ELTWISE:
+        return SaveTmEltwiseOp;
+    case OP_FLATTEN:
+        return SaveTmFlattenOp;
+    case OP_FC:
+        return SaveTmFCOp;
+    case OP_INPUT:
+        return SaveTmInputOp;
+    case OP_LRN:
+        return SaveTmLRNOp;
+    case OP_NORMALIZE:
+        return SaveTmNormalizeOp;
+    case OP_PERMUTE:
+        return SaveTmPermuteOp;
+    case OP_POOL:
+        return SaveTmPoolingOp;
+    case OP_PRELU:
+        return SaveTmPreluOp;
+    case OP_PRIORBOX:
+        return SaveTmPriorBoxOp;
+    case OP_REGION:
+        return SaveTmRegionOp;
+    case OP_RELU:
+        return SaveTmReLuOp;
+    case OP_RELU6:
+        return SaveTmRelu6Op;
+    case OP_REORG:
+        return SaveTmReorgOp;
+    case OP_RESHAPE:
+        return SaveTmReshapeOp;
+    case OP_ROIPOOLING:
+        return SaveTmROIPoolingOp;
+    case OP_RPN:
+        return SaveTmRPNOp;
+    case OP_SCALE:
+        return SaveTmScaleOp;
+    case OP_SLICE:
+        return SaveTmSliceOp;
+    case OP_SOFTMAX:
+        return SaveTmSoftmaxOp;
+    case OP_SPLIT:
+        return SaveTmSplitOp;
+    case OP_DETECTION_POSTPROCESS:
+        return SaveTmDetectionPostProcessOp;
+    case OP_GEMM:
+        return SaveTmGemmOp;
+    case OP_LOGISTIC:
+        return SaveTmLogisticOp;
+    case OP_LSTM:
+        return SaveTmLstmOp;
+    case OP_RNN:
+        return SaveTmRnnOp;
+    case OP_TANH:
+        return SaveTmTanhOp;
+    case OP_SIGMOID:
+        return SaveTmSigmoidOp;
+    case OP_SQUEEZE:
+        return SaveTmSqueezeOp;
+    case OP_SWAP_AXIS:
+        return SaveTmSwapAxisOp;
+    case OP_GRU:
+        return SaveTmGruOp;
+    case OP_ARGMAX:
+        return SaveTmArgMaxOp;
+    case OP_ARGMIN:
+        return SaveTmArgMinOp;
+    case OP_TOPKV2:
+        return SaveTmTopKV2Op;
+    case OP_PAD:
+        return SaveTmPadOp;
+    case OP_STRIDED_SLICE:
+        return SaveTmStridedSliceOp;
+    case OP_REDUCTION:
+        return SaveTmReductionOp;
+    case OP_UPSAMPLE:
+        return SaveTmUpsampleOp;
+    case OP_SHUFFLECHANNEL:
+        return SaveTmShuffleChannelOp;
+    case OP_SPACETOBATCHND:
+        return SaveTmSpaceToBatchNDOp;
+    case OP_BATCHTOSPACEND:
+        return SaveTmBatchToSpaceNDOp;
+    case OP_RESIZE:
+        return SaveTmResizeOp;
+    case OP_CROP:
+        return SaveTmCropOp;
+    case OP_ROIALIGN:
+        return SaveTmRoialignOp;
+    case OP_PSROIPOOLING:
+        return SaveTmPsroipoolingOp;
+    case OP_EXPANDDIMS:
+        return SaveTmExpanddimsOp;
+    case OP_UNARY:
+        return SaveTmUnaryOp;
+    case OP_NOOP:
+        return SaveTmNoopOp;
+    case OP_THRESHOLD:
+        return SaveTmThresholdOp;
+    case OP_HARDSIGMOID:
+        return SaveTmHardsigmoidOp;
+    case OP_EMBEDDING:
+        return SaveTmEmbedOp;
+    case OP_INSTANCENORM:
+        return SaveTmInstanceNormOp;
+    case OP_MVN:
+        return SaveTmMVNOp;
+    case OP_CAST:
+        return SaveTmCastOp;
+    case OP_HARDSWISH:
+        return SaveTmHardSwishOp;
+    case OP_INTERP:
+        return SaveTmInterpOp;
+    case OP_SELU:
+        return SaveTmSeluOp;
+    case OP_ELU:
+        return SaveTmEluOp;
+    case OP_BROADMUL:
+        return SaveTmBroadMulOp;
+    case OP_LOGICAL:
+        return SaveTmLogicalOp;
+    case OP_GATHER:
+        return SaveTmGatherOp;
+    case OP_TRANSPOSE:
+        return SaveTmTransposeOp;
+    case OP_COMPARISON:
+        return SaveTmComparisonOp;
+    case OP_REVERSE:
+        return SaveTmReverseOp;
+    case OP_SPACETODEPTH:
+        return SaveTmSpaceToDepthOp;
+    case OP_DEPTHTOSPACE:
+        return SaveTmDepthToSpaceOp;
+    case OP_SQUAREDDIFFERENCE:
+        return SaveTmSquaredDifferenceOp;
+    case OP_SPARSETODENSE:
+        return SaveTmSparseToDenseOp;
+    case OP_CEIL:
+        return SaveTmCeilOp;
+    case OP_ROUND:
+        return SaveTmRoundOp;
+    case OP_ZEROSLIKE:
+        return SaveTmZerosLikeOp;
+    case OP_CLIP:
+        return SaveTmClipOp;
+    case OP_REDUCEL2:
+        return SaveTmReduceL2Op;
+    case OP_UNSQUEEZE:
+        return SaveTmUnsqueezeOp;
+    case OP_MEAN:
+        return SaveTmMeanOp;
+    case OP_MATMUL:
+        return SaveTmMatMulOp;
+    case OP_MISH:
+        return SaveTmMishOp;
+    default:
+        // fprintf(stderr, "Operator #%d not supported in tengine model yet\n",op_type);
+        return nullptr;
     }
 }
\ No newline at end of file
diff --git a/tools/quantize/savegraph/tm2_op_save.hpp b/tools/quantize/savegraph/tm2_op_save.hpp
index 78ad4e40e..08d63ec4d 100644
--- a/tools/quantize/savegraph/tm2_op_save.hpp
+++ b/tools/quantize/savegraph/tm2_op_save.hpp
@@ -3,14 +3,13 @@
 
 #include <functional>
 extern "C" {
-    #include "utility/vector.h"
-    #include "tm2_format.h"
-    #include "tm2_generate.h"
-    #include "graph/node.h"
-    
-    #include "op_include.h"
-}
+#include "utility/vector.h"
+#include "tm2_format.h"
+#include "tm2_generate.h"
+#include "graph/node.h"
 
+#include "op_include.h"
+}
 
 using op_save_t = std::function<tm_uoffset_t(void* const, tm_uoffset_t*, struct node*)>;
 op_save_t SaveTmOpFunc(uint32_t op_type);