From ea6ee1520b11e89c41c076ecacfa64bfacac80d6 Mon Sep 17 00:00:00 2001 From: primenumber Date: Thu, 12 Mar 2020 18:48:34 +0900 Subject: [PATCH 1/2] Fix threshold skipping if flg = -1 --- .../impl/arm_neon/quantized_conv2d_tiling.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp b/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp index 4a7e2608f..aae7ae76c 100644 --- a/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp +++ b/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp @@ -281,7 +281,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input, if (p.thresholds != nullptr) { #define LOAD_TH(k) \ const auto ts##k = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \ - const auto is_neg##k = vreinterpretq_s16_u16(vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \ + const auto mask##k = vreinterpretq_s16_u16(0x0003 & vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \ const auto m2_##k = vsubq_s16(ts##k.val[3], vdupq_n_s16(2)); \ const auto is_const##k = vcgeq_s16(m2_##k, vdupq_n_s16(0)); LOAD_TH(0) @@ -292,11 +292,11 @@ void QuantizedConv2DTiling(const tiling_input_t& input, if (col_high + col >= out_width) break; #define APPLY(k) \ const auto d##k = vld1q_s16(out_tile + buf_index + 8 * k); \ - const auto f##k##0 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[0])) & ts##k.val[3]; \ - const auto f##k##1 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[1])) & ts##k.val[3]; \ - const auto f##k##2 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[2])) & ts##k.val[3]; \ - const auto tmp##k = f##k##0 + f##k##1 + f##k##2 + is_neg##k; \ - const auto res##k = vreinterpretq_u8_s16(vbslq_s16(is_const##k, m2_##k, tmp##k)); + const auto f##k##0 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[0]) & 0x0001u); \ + const auto f##k##1 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[1]) & 0x0001u); \ + const auto f##k##2 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[2]) & 0x0001u); \ + const auto tmp##k = f##k##0 + f##k##1 + f##k##2; \ + const auto res##k = vreinterpretq_u8_s16(vbslq_s16(is_const##k, m2_##k, mask##k ^ tmp##k)); const auto buf_index = row * TileWidth * OutChUnroll + col * OutChUnroll; APPLY(0) @@ -546,7 +546,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input, if (p.thresholds != nullptr) { #define LOAD_TH(k) \ const auto ts##k = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \ - const auto is_neg##k = vreinterpretq_s16_u16(vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \ + const auto mask##k = vreinterpretq_s16_u16(0x0003 & vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \ const auto m2_##k = vsubq_s16(ts##k.val[3], vdupq_n_s16(2)); \ const auto is_const##k = vcgeq_s16(m2_##k, vdupq_n_s16(0)); LOAD_TH(0) @@ -557,11 +557,11 @@ void QuantizedConv2DTiling(const tiling_input_t& input, if (col_high + col >= out_width) break; #define APPLY(k) \ const auto d##k = vld1q_s16(out_tile + buf_index + 8 * k); \ - const auto f##k##0 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[0])) & ts##k.val[3]; \ - const auto f##k##1 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[1])) & ts##k.val[3]; \ - const auto f##k##2 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[2])) & ts##k.val[3]; \ - const auto tmp##k = f##k##0 + f##k##1 + f##k##2 + is_neg##k; \ - const auto res##k = vreinterpretq_u8_s16(vbslq_s16(is_const##k, m2_##k, tmp##k)); + const auto f##k##0 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[0]) & 0x0001u); \ + const auto f##k##1 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[1]) & 0x0001u); \ + const auto f##k##2 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[2]) & 0x0001u); \ + const auto tmp##k = f##k##0 + f##k##1 + f##k##2; \ + const auto res##k = vreinterpretq_u8_s16(vbslq_s16(is_const##k, m2_##k, mask##k ^ tmp##k)); const auto buf_index = row * TileWidth * OutChUnroll + col * OutChUnroll; APPLY(0) From dec308a8cde172fa077f5d656309efa72b6a068c Mon Sep 17 00:00:00 2001 From: primenumber Date: Thu, 12 Mar 2020 19:38:32 +0900 Subject: [PATCH 2/2] Fix greater-than to greater-or-equal --- .../func/impl/arm_neon/quantized_conv2d_tiling.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp b/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp index aae7ae76c..b651a50df 100644 --- a/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp +++ b/blueoil/converter/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp @@ -292,9 +292,9 @@ void QuantizedConv2DTiling(const tiling_input_t& input, if (col_high + col >= out_width) break; #define APPLY(k) \ const auto d##k = vld1q_s16(out_tile + buf_index + 8 * k); \ - const auto f##k##0 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[0]) & 0x0001u); \ - const auto f##k##1 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[1]) & 0x0001u); \ - const auto f##k##2 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[2]) & 0x0001u); \ + const auto f##k##0 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[0]) & 0x0001u); \ + const auto f##k##1 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[1]) & 0x0001u); \ + const auto f##k##2 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[2]) & 0x0001u); \ const auto tmp##k = f##k##0 + f##k##1 + f##k##2; \ const auto res##k = vreinterpretq_u8_s16(vbslq_s16(is_const##k, m2_##k, mask##k ^ tmp##k)); const auto buf_index = row * TileWidth * OutChUnroll @@ -557,9 +557,9 @@ void QuantizedConv2DTiling(const tiling_input_t& input, if (col_high + col >= out_width) break; #define APPLY(k) \ const auto d##k = vld1q_s16(out_tile + buf_index + 8 * k); \ - const auto f##k##0 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[0]) & 0x0001u); \ - const auto f##k##1 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[1]) & 0x0001u); \ - const auto f##k##2 = vreinterpretq_s16_u16(vcgtq_s16(d##k, ts##k.val[2]) & 0x0001u); \ + const auto f##k##0 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[0]) & 0x0001u); \ + const auto f##k##1 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[1]) & 0x0001u); \ + const auto f##k##2 = vreinterpretq_s16_u16(vcgeq_s16(d##k, ts##k.val[2]) & 0x0001u); \ const auto tmp##k = f##k##0 + f##k##1 + f##k##2; \ const auto res##k = vreinterpretq_u8_s16(vbslq_s16(is_const##k, m2_##k, mask##k ^ tmp##k)); const auto buf_index = row * TileWidth * OutChUnroll