blue-oil · bo-mergebot · Dec 13, 2019 · Dec 10, 2019 · Dec 10, 2019 · Dec 10, 2019
diff --git a/dlk/python/dlk/code_generater.py b/dlk/python/dlk/code_generater.py
@@ -92,10 +92,8 @@ def generate_thresholds(self):
         src_template_path = path.join('manual', 'consts', 'thresholds.tpl.cpp')
         header_template_path = path.join('manual', 'consts', 'thresholds.tpl.h')
 
-        qconvs_with_ts = [x for x in self.graph.non_variables
-                          if x.op_type == 'Conv'
-                          and cast(Conv, x).is_quantized
-                          and cast(Conv, x).has_thresholds]
+        qconvs_with_ts = [x for x in self.graph.convs(quantized_only=True)
+                          if x.has_thresholds]
 
         self.template.generate(src_template_path,
                                self.src_dir,

diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py
@@ -81,7 +81,7 @@ def run(self):
 
             return self.format_string(
                 f"""
-                func_QTZ_linear_mid_tread_half({inputs_string}, {op.name});
+                func_QTZ_linear_mid_tread_half({inputs_string}, {op.name}, quantize_tmp_buffer.get());
                 """
             )
 
@@ -119,7 +119,7 @@ def run(self):
                 inputs_string = self.inputs_to_string(op, input_ops)
 
                 if op.has_thresholds:
-                    threshold = f'{op.name}_thresholds'
+                    threshold = f'{op.name}_thresholds_converted.get()'
                     thresholds_addr = f'THRESHOLD_ADDR + {op.name}_thresholds_offset'
                     conv_func = 'func_QuantizedConv2DWithThreshold'
                     nbit_aqtz = self.op.a_quantizer[0].nbit
@@ -146,6 +146,7 @@ def run(self):
                     Conv2D_struct.padding = {pad};
                     Conv2D_struct.stride_along_height = {stride};
                     Conv2D_struct.stride_along_width = {stride};
+                    Conv2D_struct.temporary_buf = qconv_tmp_buffer.get();
 
                     binConv2D_struct.normal_conv_params = Conv2D_struct;
                     binConv2D_struct.bin_input_extra_bits = 0;
@@ -192,6 +193,7 @@ def run(self):
                     Conv2D_struct.padding = {pad};
                     Conv2D_struct.stride_along_height = {stride};
                     Conv2D_struct.stride_along_width = {stride};
+                    Conv2D_struct.temporary_buf = conv_tmp_buffer.get();
 
                     func_Conv2D({inputs_string}, {op.name}, Conv2D_struct);
                     """

diff --git a/dlk/python/dlk/templates/include/func/impl/quantized_conv2d_kn2row.h b/dlk/python/dlk/templates/include/func/impl/quantized_conv2d_kn2row.h
@@ -27,6 +27,7 @@ namespace impl {
 using kn2row_input_elem_t = QUANTIZED_PACKED;
 
 #ifndef RUN_ON_FPGA
+void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels);
 using kn2row_input_t = TensorView<kn2row_input_elem_t, MemoryLayout::HWChBCl>;
 void QuantizedConv2DKn2Row(const kn2row_input_t& input,
                                   const kernel_t& kernel,

diff --git a/dlk/python/dlk/templates/include/func/impl/quantized_conv2d_tiling.h b/dlk/python/dlk/templates/include/func/impl/quantized_conv2d_tiling.h
@@ -31,6 +31,8 @@ using tiling_input_t = TensorView<tiling_input_elem_t, MemoryLayout::ChHWBCl>;
 void pack_input_for_tiling(const TensorView<QUANTIZED_NOT_PACKED, MemoryLayout::NHWC>& input,
     const tiling_input_t& output);
 
+void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels);
+
 void QuantizedConv2DTiling(const tiling_input_t& input,
                                   const kernel_t& kernel,
                                   const binary_convolution_parameters &p);

diff --git a/dlk/python/dlk/templates/include/global.tpl.h b/dlk/python/dlk/templates/include/global.tpl.h
@@ -23,6 +23,14 @@ limitations under the License.
 #include <type_traits>
 #include "func/impl/pop_count.h"
 
+#include <cstddef>  // included first: the standard library defines __cpp_lib_byte in <cstddef>, so the test below no longer depends on earlier includes
+#ifdef __cpp_lib_byte  // the standard library provides std::byte
+using BYTE = std::byte;
+#else
+enum class byte : unsigned char {};  // fallback stand-in, declared the same way the standard specifies std::byte
+using BYTE = byte;
+#endif
+
 typedef uint32_t T_UINT;
 typedef int32_t  T_INT;
 typedef float    T_FLOAT;

diff --git a/dlk/python/dlk/templates/include/matrix/multiplication.h b/dlk/python/dlk/templates/include/matrix/multiplication.h
@@ -16,22 +16,27 @@ limitations under the License.
 #ifndef DLK_MATRIX_MULTIPLICATION_H_INCLUDED
 #define DLK_MATRIX_MULTIPLICATION_H_INCLUDED
 
+#include "global.h"
 #include "matrix_view.h"
 #include "time_measurement.h"
 
 namespace dlk {
 
 namespace details {
 
+constexpr std::size_t MAX_UNROLL = 16; // hard coded, not configurable
+
 void matrix_multiplication_col3(
   MatrixView<float, MatrixOrder::RowMajor>& A,
   MatrixView<float, MatrixOrder::ColMajor>& B,
-  MatrixView<float, MatrixOrder::ColMajor>& C);
+  MatrixView<float, MatrixOrder::ColMajor>& C,
+  BYTE *temporary_buf);
 
 void matrix_multiplication_impl(
-   MatrixView<float, MatrixOrder::RowMajor>& A,
-   MatrixView<float, MatrixOrder::ColMajor>& B,
-   MatrixView<float, MatrixOrder::ColMajor>& C);
+  MatrixView<float, MatrixOrder::RowMajor>& A,
+  MatrixView<float, MatrixOrder::ColMajor>& B,
+  MatrixView<float, MatrixOrder::ColMajor>& C,
+  BYTE *temporary_buf);
 
 } // namespace details
 
@@ -40,21 +45,22 @@ template<typename T, typename U, typename V>
 void matrix_multiplication(
    MatrixView<T, MatrixOrder::RowMajor>& A,
    MatrixView<U, MatrixOrder::ColMajor>& B,
-   MatrixView<V, MatrixOrder::ColMajor>& C) {
+   MatrixView<V, MatrixOrder::ColMajor>& C,
+   BYTE *temporary_buf) {
 
   assert(A.cols() == B.rows());
   Measurement::Start("matrix_multiplication");
 
 #ifdef USE_NEON
   if (A.cols() == 3 && A.rows() % 4 == 0) {
-    details::matrix_multiplication_col3(A, B, C);
+    details::matrix_multiplication_col3(A, B, C, temporary_buf);
   } else {
-    details::matrix_multiplication_impl(A, B, C);
+    details::matrix_multiplication_impl(A, B, C, temporary_buf);
   }
   Measurement::Stop();
   return;
 #elif defined USE_AVX
-  details::matrix_multiplication_impl(A, B, C);
+  details::matrix_multiplication_impl(A, B, C, temporary_buf);
   Measurement::Stop();
   return;
 #endif

diff --git a/dlk/python/dlk/templates/include/network.tpl.h b/dlk/python/dlk/templates/include/network.tpl.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef NETWORK_H_INCLUDED
 #define NETWORK_H_INCLUDED
 
+#include <memory>
 #include "global.h"
 #include "dma_buffer.h"
 
@@ -60,6 +61,10 @@ class SYM_PUBLIC Network
     QUANTIZED_PACKED *device_input_buf = 0;
     BIN_CONV_OUTPUT *device_output_buf = 0;
 
+    std::unique_ptr<BYTE[]> qconv_tmp_buffer;
+    std::unique_ptr<BYTE[]> conv_tmp_buffer;
+    std::unique_ptr<BYTE[]> quantize_tmp_buffer;
+
     const T_INT input_rank = {{ graph_input.rank }};
     const T_INT input_shape[{{ graph_input.rank }}] = { {{ graph_input.view.shape_as_cpp }} };
 
@@ -98,6 +103,15 @@ class SYM_PUBLIC Network
   {% endfor -%}
   const uint32_t total_thresholds_size = std::max(1, {{th_offset.o}});
 #endif // RUN_ON_FPGA
+  {% for qconv in graph.convs(quantized_only=True) -%}
+  {%     if qconv.has_thresholds -%}
+  {%         set b = 32 -%}
+  {%         set channels_padded = qconv.channel + (b - qconv.channel % b) % b -%}
+  const std::unique_ptr<BIN_CONV_OUTPUT[]> {{qconv.name}}_thresholds_converted = std::make_unique<BIN_CONV_OUTPUT[]>({{channels_padded}} * NUM_OF_A2W1_THRESHOLD);
+  {%     else -%}
+  const std::unique_ptr<BIN_CONV_OUTPUT[]> {{qconv.name}}_thresholds_converted;
+  {%     endif -%}
+  {% endfor -%}
 };
 
 #endif // NETWORK_H_INCLUDED

diff --git a/dlk/python/dlk/templates/include/operators.h b/dlk/python/dlk/templates/include/operators.h
@@ -33,6 +33,7 @@ struct convolution_parameters {
   T_UINT stride_along_height;
   T_UINT stride_along_width;
   T_UINT padding;
+  BYTE *temporary_buf;
 };
 
 struct binary_convolution_parameters {

diff --git a/dlk/python/dlk/templates/include/quantizer.h b/dlk/python/dlk/templates/include/quantizer.h
@@ -46,12 +46,14 @@ void func_QTZ_linear_mid_tread_half(
     const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
     const TensorView<T_INT, MemoryLayout::Atom>& nbit,
     const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
-    const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output);
+    const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output,
+    BYTE *temporary_buf);
 
 void func_QTZ_linear_mid_tread_half(
   const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
   const TensorView<T_INT, MemoryLayout::Atom>& nbit,
   const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
-  const TensorView<T_FLOAT, MemoryLayout::NHWC>& output);
+  const TensorView<T_FLOAT, MemoryLayout::NHWC>& output,
+  BYTE *temporary_buf);
 
 #endif // QUANTIZER_H_INCLUDED
diff --git a/dlk/python/dlk/templates/src/func/conv2d.cpp b/dlk/python/dlk/templates/src/func/conv2d.cpp
@@ -55,7 +55,8 @@ void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
   assert(p.input_height > 0);
   assert(p.input_width > 0);
 
-  static U buf[MAX_SIZE_KN2ROW_BUFFER_PER_LAYER];
+  T* buf = reinterpret_cast<T*>(p.temporary_buf) + MAX_SIZE_KERNELS_PER_LAYER; // offset comes from kernel layout convert buffer
+  BYTE* matmul_buf = reinterpret_cast<BYTE*>(buf + MAX_SIZE_KN2ROW_BUFFER_PER_LAYER);
 
   Measurement::Stop();
 
@@ -66,7 +67,7 @@ void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
     auto input_ = dlk::MatrixView<T, dlk::MatrixOrder::ColMajor>(input.data() + ic * offset, ic, col_block);
     auto buf_ = dlk::MatrixView<U, dlk::MatrixOrder::ColMajor>(buf, oc * kh * kw, col_block);
 
-    dlk::matrix_multiplication(kernels_, input_, buf_);
+    dlk::matrix_multiplication(kernels_, input_, buf_, matmul_buf);
     dlk::matrix_shift_add(buf_, output_, p, offset);
   }
 
@@ -108,16 +109,18 @@ void conv1x1_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
   Measurement::Start("kn2row-1x1");
 
 
-   assert(p.input_height > 0);
-   assert(p.input_width > 0);
+  assert(p.input_height > 0);
+  assert(p.input_width > 0);
 
-   auto kernels_ = dlk::MatrixView<T, dlk::MatrixOrder::RowMajor>(kernels.data(), oc * kh * kw, ic);
-   auto input_ = dlk::MatrixView<T, dlk::MatrixOrder::ColMajor>(input.data(), ic, p.input_height * p.input_width);
-   auto output_ = dlk::MatrixView<U, dlk::MatrixOrder::ColMajor>(output.data(), oc, p.input_height * p.input_width);
+  auto kernels_ = dlk::MatrixView<T, dlk::MatrixOrder::RowMajor>(kernels.data(), oc * kh * kw, ic);
+  auto input_ = dlk::MatrixView<T, dlk::MatrixOrder::ColMajor>(input.data(), ic, p.input_height * p.input_width);
+  auto output_ = dlk::MatrixView<U, dlk::MatrixOrder::ColMajor>(output.data(), oc, p.input_height * p.input_width);
 
-   dlk::matrix_multiplication(kernels_, input_, output_);
+  // offset comes from kernel layout convert buffer
+  BYTE* matmul_buf = p.temporary_buf + MAX_SIZE_KERNELS_PER_LAYER * sizeof(T);
+  dlk::matrix_multiplication(kernels_, input_, output_, matmul_buf);
 
-   Measurement::Stop();
+  Measurement::Stop();
 }
 
 template<typename T>
@@ -183,15 +186,15 @@ void convolution(
     return;
   } else if (p.kernel_height == 3 && p.kernel_width == 3 && p.padding == 1) {
     int kernels_size = p.kernel_height * p.kernel_width * p.kernel_depth * p.output_channels;
-    const auto kernels_hwoi_buf = std::make_unique<T[]>(kernels_size);
+    T* buf = reinterpret_cast<T*>(p.temporary_buf);
     using hwoi_t = TensorView<T, MemoryLayout::HWOI>;
     typename hwoi_t::template tensor_info_t<std::size_t> hwoi_shape = {
       p.kernel_height,
       p.kernel_width,
       p.output_channels,
       p.kernel_depth
     };
-    hwoi_t kernels_hwoi(kernels_hwoi_buf.get(), hwoi_shape);
+    hwoi_t kernels_hwoi(buf, hwoi_shape);
     ohwi_to_hwoi(kernels, kernels_hwoi, p);
     conv3x3_kn2row(input, kernels_hwoi, output, p);
     return;

diff --git a/dlk/python/dlk/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp b/dlk/python/dlk/templates/src/func/impl/arm_neon/quantized_conv2d_tiling.cpp
@@ -30,8 +30,6 @@ namespace dlk {
 
 namespace impl {
 
-static auto buf_th = std::make_unique<BIN_CONV_OUTPUT[]>(NUM_OF_A2W1_THRESHOLD * MAX_IN_C);
-
 void pack_input_for_tiling(const TensorView<QUANTIZED_NOT_PACKED, MemoryLayout::NHWC>& input,
     const tiling_input_t& output) {
   Measurement::Start("Pack_input_for_tiling");
@@ -72,6 +70,33 @@ void pack_input_for_tiling(const TensorView<QUANTIZED_NOT_PACKED, MemoryLayout::
   Measurement::Stop();
 }
 
+void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels) {  // copies `channels` threshold records from input to output, adding 1 to values 0..2 of every record whose value 3 is negative
+  std::size_t i = 0;
+  for (; i + 8 <= channels; i += 8) {  // vector path: 8 channels per iteration
+    const auto v = vld4q_s16(input + NUM_OF_A2W1_THRESHOLD * i);  // de-interleaving load: v.val[k] holds value k of 8 consecutive records
+    const auto is_neg = vreinterpretq_s16_u16(vmvnq_u16(vcgeq_s16(v.val[3], vdupq_n_s16(0))));  // lane is -1 where val[3] < 0, otherwise 0
+    int16x8x4_t res;
+    res.val[0] = vsubq_s16(v.val[0], is_neg);  // subtracting -1 adds one on the flagged lanes
+    res.val[1] = vsubq_s16(v.val[1], is_neg);
+    res.val[2] = vsubq_s16(v.val[2], is_neg);
+    res.val[3] = v.val[3];  // the flag value itself is copied unchanged
+    vst4q_s16(output + NUM_OF_A2W1_THRESHOLD * i, res);
+  }
+  for (; i < channels; ++i) {  // scalar tail for the remaining channels % 8 records
+    BIN_CONV_OUTPUT v0 = input[NUM_OF_A2W1_THRESHOLD * i + 0];
+    BIN_CONV_OUTPUT v1 = input[NUM_OF_A2W1_THRESHOLD * i + 1];
+    BIN_CONV_OUTPUT v2 = input[NUM_OF_A2W1_THRESHOLD * i + 2];
+    const BIN_CONV_OUTPUT flg = input[NUM_OF_A2W1_THRESHOLD * i + 3];
+    if (flg < 0) {  // must mirror the vector path above, which adds one when the flag is negative
+      ++v0; ++v1; ++v2;
+    }
+    output[NUM_OF_A2W1_THRESHOLD * i + 0] = v0;
+    output[NUM_OF_A2W1_THRESHOLD * i + 1] = v1;
+    output[NUM_OF_A2W1_THRESHOLD * i + 2] = v2;
+    output[NUM_OF_A2W1_THRESHOLD * i + 3] = flg;
+  }
+}
+
 void QuantizedConv2DTiling(const tiling_input_t& input,
                                   const kernel_t& kernel,
                                   const binary_convolution_parameters &p) {
@@ -95,18 +120,6 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
   assert((in_channels % InTypeBitWidth) == 0);
 
   Measurement::Start("Quantized Conv2D Tiling");
-  if (p.thresholds != nullptr) {
-    for (T_UINT i = 0; i < out_channels; i += 8) {
-      const auto v = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * i);
-      const auto is_neg = vreinterpretq_s16_u16(vmvnq_u16(vcgeq_s16(v.val[3], vdupq_n_s16(0))));
-      int16x8x4_t res;
-      res.val[0] = vsubq_s16(v.val[0], is_neg);
-      res.val[1] = vsubq_s16(v.val[1], is_neg);
-      res.val[2] = vsubq_s16(v.val[2], is_neg);
-      res.val[3] = v.val[3];
-      vst4q_s16(buf_th.get() + NUM_OF_A2W1_THRESHOLD * i, res);
-    }
-  }
   constexpr uint8_t coeff_ary[16] = {
     0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
     0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
@@ -264,7 +277,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
       }
       if (p.thresholds != nullptr) {
 #define LOAD_TH(k) \
-  const auto ts##k = vld4q_s16(buf_th.get() + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
+  const auto ts##k = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
   const auto is_neg##k = vreinterpretq_s16_u16(vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \
   const auto m2_##k = vsubq_s16(ts##k.val[3], vdupq_n_s16(2)); \
   const auto is_const##k = vcgeq_s16(m2_##k, vdupq_n_s16(0));
@@ -529,7 +542,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
       }
       if (p.thresholds != nullptr) {
 #define LOAD_TH(k) \
-  const auto ts##k = vld4q_s16(buf_th.get() + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
+  const auto ts##k = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
   const auto is_neg##k = vreinterpretq_s16_u16(vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \
   const auto m2_##k = vsubq_s16(ts##k.val[3], vdupq_n_s16(2)); \
   const auto is_const##k = vcgeq_s16(m2_##k, vdupq_n_s16(0));