Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Experimental support for 5x5 convolution on CPU #683

Merged
merged 17 commits into from
Mar 5, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions dlk/python/dlk/templates/include/de10_nano.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,14 @@ Parameters calcParameters(uint32_t inputHeight, uint32_t inputWidth, uint32_t in
constexpr uint32_t maxBurst = 32;
constexpr uint32_t b = 32;

assert((kernelHeight == 3 && kernelWidth == 3) || (kernelHeight == 1 && kernelWidth == 1));

uint32_t pad = (kernelHeight == 1) ? 0 : 1;
constexpr std::size_t n_bit = 2;
constexpr std::size_t maxA = (1 << n_bit) - 1;
assert(kernelHeight == kernelWidth); // kernel rectangle must be square
assert(kernelHeight % 2 == 1); // kernel size must be odd
assert(1 <= kernelHeight && kernelHeight <= 3); // Currently, only 1x1, 3x3 conv are supported
assert(inputChannels * kernelHeight * kernelWidth * maxA <= std::numeric_limits<BIN_CONV_OUTPUT>::max()); // overflow check

uint32_t pad = kernelHeight / 2;
uint32_t dep = kernelHeight - 1;

auto outputHeight = inputHeight + 2 * pad - dep;
Expand Down Expand Up @@ -235,15 +240,8 @@ Parameters calcParameters(uint32_t inputHeight, uint32_t inputWidth, uint32_t in
p.a2fKernelVCount = kernelHeight;
p.a2fKernelHCount = kernelWidth;

if (kernelHeight == 1) {
p.a2fTileStep = 1u;
p.a2fTileGap = 1u;
}
else {
// TODO: 3x3 stride one assumed here
p.a2fTileStep = 1u;
p.a2fTileGap = 3u;
}
p.a2fTileStep = 1u; // stride one assumed
p.a2fTileGap = kernelHeight; // stride one assumed

p.a2fOutputHCount = hCount;
p.a2fOutputWCount = wCount;
Expand Down
70 changes: 35 additions & 35 deletions dlk/python/dlk/templates/include/func/quantized_conv2d.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,49 +48,49 @@ void QuantizedConv2D(
T_UINT iw = p.normal_conv_params.input_width;
T_UINT ic = p.normal_conv_params.kernel_depth;
T_UINT oc = p.normal_conv_params.output_channels;
T_UINT maxa = (1 << p.n_bit) - 1;
auto size = oc * ih * iw;
if (p.device_output_buf == nullptr)
p.device_output_buf = new BIN_CONV_OUTPUT[size]();

if ((kh == 3 && kw == 3 && padding == 1) ||
(kh == 1 && kw == 1 && padding == 0)) {
assert(kh == kw); // kernel rectangle must be square
assert(kh % 2 == 1); // kernel size must be odd
assert(1 <= kh && kh <= 5); // Only 1x1, 3x3, 5x5 are supported
assert(ic * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max()); // overflow check
#ifdef RUN_ON_FPGA
dlk::impl::tca_input_t::tensor_info_t<std::size_t> shape = {
(ic + QUANTIZED_PACKED::BitCount - 1) / QUANTIZED_PACKED::BitCount,
ih,
iw,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::tca_input_t tmp((QUANTIZED_PACKED*)p.device_input_buf, shape);
convert_tensor(input, tmp);
dlk::impl::TCAConv2d(tmp, kernel, p);
dlk::impl::tca_input_t::tensor_info_t<std::size_t> shape = {
(ic + QUANTIZED_PACKED::BitCount - 1) / QUANTIZED_PACKED::BitCount,
ih,
iw,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::tca_input_t tmp((QUANTIZED_PACKED*)p.device_input_buf, shape);
convert_tensor(input, tmp);
dlk::impl::TCAConv2d(tmp, kernel, p);
#elif defined USE_NEON || defined USE_AVX
dlk::impl::tiling_input_t::tensor_info_t<std::size_t> shape = {
ic / TilingInTypeBitWidth,
ih,
iw,
p.bin_input_bitwidth,
TilingInTypeBitWidth
};
dlk::impl::tiling_input_t tmp(reinterpret_cast<dlk::impl::tiling_input_elem_t*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DTiling(tmp, kernel, p);
dlk::impl::tiling_input_t::tensor_info_t<std::size_t> shape = {
ic / TilingInTypeBitWidth,
ih,
iw,
p.bin_input_bitwidth,
TilingInTypeBitWidth
};
dlk::impl::tiling_input_t tmp(reinterpret_cast<dlk::impl::tiling_input_elem_t*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DTiling(tmp, kernel, p);
#else
dlk::impl::kn2row_input_t::tensor_info_t<std::size_t> shape = {
ih,
iw,
ic / QUANTIZED_PACKED::BitCount,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::kn2row_input_t tmp(reinterpret_cast<QUANTIZED_PACKED*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DKn2Row(tmp, kernel, p);
dlk::impl::kn2row_input_t::tensor_info_t<std::size_t> shape = {
ih,
iw,
ic / QUANTIZED_PACKED::BitCount,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::kn2row_input_t tmp(reinterpret_cast<QUANTIZED_PACKED*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DKn2Row(tmp, kernel, p);
#endif
} else {
throw std::invalid_argument("Unsupported convolution parameter");
}

Measurement::Stop();
}
Expand Down
97 changes: 19 additions & 78 deletions dlk/python/dlk/templates/include/matrix/shift_add.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,96 +22,37 @@ limitations under the License.

namespace dlk {

// True when the flattened pixel index `j` falls on the left-most (first)
// column of an image row of width `w`.
inline bool is_first_column(int j, int w) {
  const int col = j % w;
  return col == 0;
}

// True when the flattened pixel index `j` falls on the right-most (last)
// column of an image row of width `w`.
inline bool is_last_column(int j, int w) {
  const int col = j % w;
  return col == w - 1;
}

// 3x3 matrix
/* A B C */
/* D E F */
/* G H I */

// Is row `i` of the kn2row buffer produced by the right-most kernel column
// (cells C, F, I of the 3x3 kernel)? Each kernel cell spans `oc` consecutive
// rows, so i/oc identifies the kernel cell in row-major order (0..8).
inline bool is_cfi(int i, int oc) {
  const int cell = i / oc;
  return cell == 2 || cell == 5 || cell == 8;
}

// Is row `i` of the kn2row buffer produced by the left-most kernel column
// (cells A, D, G of the 3x3 kernel)? Each kernel cell spans `oc` consecutive
// rows, so i/oc identifies the kernel cell in row-major order (0..8).
inline bool is_adg(int i, int oc) {
  const int cell = i / oc;
  return cell == 0 || cell == 3 || cell == 6;
}

// Note: this function is only for 3x3 kernel
// Maps the row-major 3x3 kernel cell index `i` (0..8) to the offset, in
// flattened image coordinates, from the kernel's center cell. `w` is the
// image width.
//
//   cell index:  0 1 2      offset:  w+1   w   w-1
//                3 4 5                 1   0    -1
//                6 7 8               -w+1  -w  -w-1
//
// The original switch-based version fell off the end of the function for
// i > 8 when NDEBUG disabled the assert, which is undefined behavior for a
// value-returning function; the closed form below always returns.
inline int calc_offset(int i, int w) {
  assert(0 <= i && i <= 8);  // only valid for a 3x3 kernel
  const int dr = 1 - i / 3;  // vertical distance from the center row
  const int dc = 1 - i % 3;  // horizontal distance from the center column
  return dr * w + dc;
}

template<typename T>
void matrix_shift_add(MatrixView<T, MatrixOrder::ColMajor>& buf,
MatrixView<T, MatrixOrder::ColMajor>& result,
const struct convolution_parameters& p,
const int block_offset) {
Measurement::Start("matrix_shift_add1");

const int h = p.input_height;
const int w = p.input_width;
const int oc = p.output_channels;
const int kh = p.kernel_height;
const int kw = p.kernel_width;
const auto col_block = buf.cols();

// only 3x3 kernel is supported.
assert(kh == 3 && kw == 3);

for (unsigned int j = 0; j < col_block; ++j) {
for (unsigned int i = 0; i < buf.rows(); ++i) {
if (is_first_column(j + block_offset, w) && is_cfi(i, p.output_channels)) {
buf.set(i, j, 0);
} else if (is_last_column(j + block_offset, w) && is_adg(i, p.output_channels)) {
buf.set(i, j, 0);
}
}
}
Measurement::Start("matrix_shift_add");

Measurement::Stop();
const std::ptrdiff_t h = p.input_height;
const std::ptrdiff_t w = p.input_width;
const std::ptrdiff_t oc = p.output_channels;
const std::ptrdiff_t kh = p.kernel_height;
const std::ptrdiff_t kw = p.kernel_width;
const std::ptrdiff_t col_block = buf.cols();
const std::ptrdiff_t pad = p.padding;

Measurement::Start("matrix_shift_add2");
// only 3x3 or 5x5 kernel is supported.
assert(kh == kw);
assert(kh % 2 == 1);
assert(3 <= kh && kh <= 5);

for (int k = 0; k < col_block; ++k) {
const auto true_k = k + block_offset;
const auto row = true_k / w;
const auto col = true_k % w;
for (unsigned int i = 0; i < kh * kw; ++i) {
int offset = calc_offset(i, w);
if ((true_k + offset < 0) || (true_k + offset >= h * w)) {
continue;
}
int kr = i / kw;
int kc = i % kw;
if (row - kr + pad < 0 || row - kr + pad >= h || col - kc + pad < 0 || col - kc + pad >= w) continue;

T* r = result.data(0, true_k + offset);
int offset = (kr - pad) * w + (kc - pad);
T* r = result.data(0, true_k - offset);
T* b = buf.data(i*oc, k);

for (unsigned int j = 0; j < oc; ++j) {
Expand Down
9 changes: 5 additions & 4 deletions dlk/python/dlk/templates/src/func/conv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ limitations under the License.
namespace {

template<typename T, typename U>
void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
void conv_nxn_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
const TensorView<T, MemoryLayout::HWOI>& kernels,
const TensorView<U, MemoryLayout::NHWC>& output,
struct convolution_parameters& p) {
Expand All @@ -44,7 +44,8 @@ void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,

// assertions
assert(ih * iw == oh * ow);
assert(kh == 3 && kw == 3);
assert(kh == kw);
assert(3 <= kh && kh <= 5);

// need to initialize output
std::memset(output.data(), 0, oc * ih * iw * sizeof(U));
Expand Down Expand Up @@ -184,7 +185,7 @@ void convolution(
int kernels_size = p.kernel_height * p.kernel_width * p.kernel_depth * p.output_channels;
conv1x1_kn2row(input, kernels, output, p);
return;
} else if (p.kernel_height == 3 && p.kernel_width == 3 && p.padding == 1) {
} else if (p.kernel_height == p.kernel_width && 3 <= p.kernel_height && p.kernel_height <= 5 && p.padding == p.kernel_height / 2) {
int kernels_size = p.kernel_height * p.kernel_width * p.kernel_depth * p.output_channels;
T* buf = reinterpret_cast<T*>(p.temporary_buf);
using hwoi_t = TensorView<T, MemoryLayout::HWOI>;
Expand All @@ -196,7 +197,7 @@ void convolution(
};
hwoi_t kernels_hwoi(buf, hwoi_shape);
ohwi_to_hwoi(kernels, kernels_hwoi, p);
conv3x3_kn2row(input, kernels_hwoi, output, p);
conv_nxn_kn2row(input, kernels_hwoi, output, p);
return;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ limitations under the License.

#include <cassert>
#include <climits>
#include <limits>

#include "global.h"
#include "func/impl/quantized_conv2d_tiling.h"
Expand Down Expand Up @@ -114,8 +115,10 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
const T_UINT out_height = cp.output_height;
const T_UINT out_width = cp.output_width;
const T_UINT out_size = out_height * out_width * out_channels;
const T_UINT maxa = (1 << in_bitwidth) - 1;

assert(kh * kw < 32);
assert(in_channels * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max());
assert(in_height * in_width == out_height * out_width);
assert((in_channels % InTypeBitWidth) == 0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License.
#include <cassert>
#include <cstring>
#include <algorithm>
#include <limits>

#include "global.h"
#include "func/impl/quantized_conv2d_kn2row.h"
Expand Down Expand Up @@ -48,6 +49,7 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,
T_UINT ow = p.normal_conv_params.output_width;
T_UINT kh = p.normal_conv_params.kernel_height;
T_UINT kw = p.normal_conv_params.kernel_width;
T_UINT maxa = (1 << p.n_bit) - 1;
BYTE *temp_buf_ptr = p.normal_conv_params.temporary_buf;

assert(ih * iw == oh * ow);
Expand All @@ -59,7 +61,13 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,
out_buf, oc, ih * iw);
auto kernel_ = MatrixView<QUANTIZED_PACKED_KERNEL, MatrixOrder::RowMajor>(
kernel.data(), oc * kh * kw, ic / 32);
if (kh == kw && kw == 3) {

assert(kh == kw);
assert(kh % 2 == 1);
assert(1 <= kh && kh <= 5);
assert(ic * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max());

if (kh >= 3) {
std::fill(out_buf, out_buf + oc * oh * ow, 0);
for (std::size_t offset = 0; offset < ih * iw; offset += MAX_SIZE_KN2ROW_COL_BLOCK) {
const auto col_block = std::min(static_cast<std::size_t>(MAX_SIZE_KN2ROW_COL_BLOCK), ih * iw - offset);
Expand All @@ -71,15 +79,12 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,
quantized_matrix_multiplication(kernel_, input_, buf_);
matrix_shift_add(buf_, output_, p.normal_conv_params, offset);
}
} else if (kh == kw && kw == 1) {
} else {
auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
input.data(), ic / 16, ih * iw);
auto output_ = MatrixView<BIN_CONV_OUTPUT, MatrixOrder::ColMajor>(
out_buf, oc, ih * iw);
quantized_matrix_multiplication(kernel_, input_, output_);
} else {
std::cerr << "Only 1x1 or 3x3 convolutions are supported." << std::endl;
assert(false);
}

const auto out_size = oc * oh * ow;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ limitations under the License.

#include <cassert>
#include <climits>
#include <limits>

#include "global.h"
#include "func/impl/quantized_conv2d_tiling.h"
Expand Down Expand Up @@ -143,8 +144,10 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
const std::size_t out_height = cp.output_height;
const std::size_t out_width = cp.output_width;
const std::size_t out_size = out_height * out_width * out_channels;
const std::size_t maxa = (1 << in_bitwidth) - 1;

//assert(kh * kw < 32);
assert(kh * kw < 32);
assert(in_channels * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max());
assert(in_height * in_width == out_height * out_width);
assert((in_channels % InTypeBitWidth) == 0);

Expand Down
Loading