Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Commit

Permalink
Merge branch 'master' into kernel_initializer_issue
Browse files Browse the repository at this point in the history
  • Loading branch information
ananno authored Jan 28, 2020
2 parents 830feb5 + 825e4b9 commit 12cca0c
Show file tree
Hide file tree
Showing 16 changed files with 161 additions and 127 deletions.
6 changes: 1 addition & 5 deletions dlk/python/dlk/core/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ def pass_quantize_convolutions(graph: Graph) -> None:
width = qtz.width
depth = qtz.channel
depth_upper = (depth + b - 1) // b
qtz.update_shape([height, width, depth_upper, 2, b], "HWChBCl")
qtz.update_shape([depth_upper, height, width, 2, b], "ChHWBCl")


def pass_propagate_datatypes(graph) -> None:
Expand Down Expand Up @@ -530,10 +530,6 @@ def pass_propagate_format(graph) -> None:
b = 32
shape = [(m.channel + b - 1) // b, m.height, m.width, 2, b]
m.update_shape(shape, m.input_nodes[0].dimension)
elif m.input_nodes[0].dimension == 'HWChBCl':
b = 32
shape = [m.height, m.width, (m.channel + b - 1) // b, 2, b]
m.update_shape(shape, m.input_nodes[0].dimension)


def pass_propagate_output_type_backward(graph: Graph) -> None:
Expand Down
2 changes: 1 addition & 1 deletion dlk/python/dlk/templates/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ endif()

if(RUN_ON_FPGA)
list(APPEND SRC_LIB_ALL src/func/arm_neon/batch_normalization.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/fpga/quantized_conv2d_kn2row.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/fpga/quantized_conv2d_accelerator.cpp)
list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pop_count.cpp)
elseif(USE_NEON)
list(APPEND SRC_LIB_ALL src/func/arm_neon/batch_normalization.cpp)
Expand Down
2 changes: 1 addition & 1 deletion dlk/python/dlk/templates/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ LIB_ARM_OBJ := $(patsubst %.cpp, %.o, $(LIB_ARM_OBJ))

LIB_FPGA_SRC := $(wildcard $(SRC_DIR)/*.S) \
$(SRC_DIR)/func/arm_neon/batch_normalization.cpp \
$(SRC_DIR)/func/impl/fpga/quantized_conv2d_kn2row.cpp \
$(SRC_DIR)/func/impl/fpga/quantized_conv2d_accelerator.cpp \
$(SRC_DIR)/func/impl/arm_neon/pop_count.cpp
LIB_FPGA_OBJ := $(patsubst %.S, %.o, $(LIB_FPGA_SRC))
LIB_FPGA_OBJ := $(patsubst %.cpp, %.o, $(LIB_FPGA_OBJ))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/* Copyright 2018 The Blueoil Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Declaration of the FPGA-accelerated quantized 2D convolution entry point
// (TCA = the hardware convolution accelerator path, compiled only when
// RUN_ON_FPGA is defined).

#ifndef DLK_FUNC_IMPL_QUANTIZED_CONV2D_ACCELERATOR_H_INCLUDED
#define DLK_FUNC_IMPL_QUANTIZED_CONV2D_ACCELERATOR_H_INCLUDED

#include "types.h"
#include "operators.h" // FIXME(nikolay): for convolution_parameters definition, rid of it later
#include "tensor_view.h"

namespace dlk {

namespace impl {

#ifdef RUN_ON_FPGA
// Packed-quantized input tensor in ChHWBCl layout (channel-blocks first,
// then height/width, then bit-plane and packed-lane dimensions —
// presumably matching what the accelerator DMA expects; confirm against
// the TCAConv2d implementation).
using tca_input_t = TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>;
// Packed-quantized kernel tensor in the accelerator's OhIhHWOlIl layout.
using tca_kernel_t = TensorView<QUANTIZED_PACKED_KERNEL, MemoryLayout::OhIhHWOlIl>;
// Runs one binary/quantized conv2d on the FPGA accelerator.
// `p` carries the convolution geometry and device buffer pointers
// (see binary_convolution_parameters in operators.h).
void TCAConv2d(const tca_input_t& input,
const tca_kernel_t& kernel,
const binary_convolution_parameters &p);
#endif // RUN_ON_FPGA

} // namespace impl

} // namespace dlk

#endif // DLK_FUNC_IMPL_QUANTIZED_CONV2D_ACCELERATOR_H_INCLUDED
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,22 @@ limitations under the License.
#ifndef DLK_FUNC_IMPL_QUANTIZED_CONV2D_KN2ROW_H_INCLUDED
#define DLK_FUNC_IMPL_QUANTIZED_CONV2D_KN2ROW_H_INCLUDED

#include "global.h"
#include "types.h"
#include "operators.h" // FIXME(nikolay): for convolution_parameters definition, rid of it later
#include "tensor_view.h"

namespace dlk {

namespace impl {

using kn2row_input_elem_t = QUANTIZED_PACKED;

#ifndef RUN_ON_FPGA
void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels);
using kn2row_input_elem_t = QuantizedPacked<uint32_t>;
using kn2row_input_t = TensorView<kn2row_input_elem_t, MemoryLayout::HWChBCl>;
using kn2row_kernel_elem_t = QuantizedPacked<uint32_t>;
using kn2row_kernel_t = TensorView<kn2row_kernel_elem_t, MemoryLayout::HWOI>;
void QuantizedConv2DKn2Row(const kn2row_input_t& input,
const kernel_t& kernel,
const binary_convolution_parameters &p);
#else
using kn2row_input_t = TensorView<kn2row_input_elem_t, MemoryLayout::ChHWBCl>;
void TCAConv2d(const kn2row_input_t& input,
const kernel_t& kernel,
const kn2row_kernel_t& kernel,
const binary_convolution_parameters &p);
#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.
#ifndef DLK_FUNC_IMPL_QUANTIZED_CONV2D_TILING_H_INCLUDED
#define DLK_FUNC_IMPL_QUANTIZED_CONV2D_TILING_H_INCLUDED

#include "global.h"
#include "types.h"
#include "operators.h" // FIXME(nikolay): for binary_convolution_parameters definition, rid of it later
#include "tensor_view.h"

Expand All @@ -27,15 +27,18 @@ namespace impl {
using tiling_input_elem_base_t = uint32_t; // hardcoded, not configurable
using tiling_input_elem_t = QuantizedPacked<tiling_input_elem_base_t>;
using tiling_input_t = TensorView<tiling_input_elem_t, MemoryLayout::ChHWBCl>;
using tiling_kernel_elem_base_t = uint32_t; // hardcoded, not configurable
using tiling_kernel_elem_t = QuantizedPacked<tiling_kernel_elem_base_t>;
using tiling_kernel_t = TensorView<tiling_kernel_elem_t, MemoryLayout::OHWI>;

void pack_input_for_tiling(const TensorView<QUANTIZED_NOT_PACKED, MemoryLayout::NHWC>& input,
const tiling_input_t& output);

void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels);

void QuantizedConv2DTiling(const tiling_input_t& input,
const kernel_t& kernel,
const binary_convolution_parameters &p);
const tiling_kernel_t& kernel,
const binary_convolution_parameters &p);

} // namespace impl

Expand Down
56 changes: 33 additions & 23 deletions dlk/python/dlk/templates/include/func/quantized_conv2d.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,23 @@ limitations under the License.
#include <memory>
#include <stdexcept>

#include "global.h"
#include "tensor_view.h"
#include "tensor_convert.h"
#include "operators.h"
#include "time_measurement.h"
#include "func/impl/quantized_conv2d_tiling.h"
#include "func/impl/quantized_conv2d_kn2row.h"
#include "func/impl/quantized_conv2d_accelerator.h"
#ifdef _OPENMP
#include <omp.h>
#endif

template <typename T, MemoryLayout layout>
void QuantizedConv2D(const TensorView<T, layout>& input,
const kernel_t& kernel,
template <typename T_input, MemoryLayout layout_input,
typename T_kernel, MemoryLayout layout_kernel>
void QuantizedConv2D(
const TensorView<QuantizedPacked<T_input>, layout_input>& input,
const TensorView<QuantizedPacked<T_kernel>, layout_kernel>& kernel,
binary_convolution_parameters p) {
Measurement::Start("QuantizedConv2D");

Expand All @@ -51,14 +55,14 @@ void QuantizedConv2D(const TensorView<T, layout>& input,
if ((kh == 3 && kw == 3 && padding == 1) ||
(kh == 1 && kw == 1 && padding == 0)) {
#ifdef RUN_ON_FPGA
dlk::impl::kn2row_input_t::tensor_info_t<std::size_t> shape = {
dlk::impl::tca_input_t::tensor_info_t<std::size_t> shape = {
(ic + QUANTIZED_PACKED::BitCount - 1) / QUANTIZED_PACKED::BitCount,
ih,
iw,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::kn2row_input_t tmp(p.device_input_buf, shape);
dlk::impl::tca_input_t tmp(p.device_input_buf, shape);
convert_tensor(input, tmp);
dlk::impl::TCAConv2d(tmp, kernel, p);
#elif defined USE_NEON || defined USE_AVX
Expand Down Expand Up @@ -91,10 +95,11 @@ void QuantizedConv2D(const TensorView<T, layout>& input,
Measurement::Stop();
}

template <typename T, MemoryLayout layout>
template <typename T_input, MemoryLayout layout_input,
typename T_kernel, MemoryLayout layout_kernel>
void func_QuantizedConv2D(
const TensorView<T, layout>& input,
const kernel_t& kernel,
const TensorView<QuantizedPacked<T_input>, layout_input>& input,
const TensorView<QuantizedPacked<T_kernel>, layout_kernel>& kernel,
const TensorView<T_FLOAT, MemoryLayout::NHWC>& output,
const T_FLOAT scaling_factor,
const binary_convolution_parameters& p) {
Expand Down Expand Up @@ -130,10 +135,11 @@ void func_QuantizedConv2D(

}

template <typename T, MemoryLayout layout>
template <typename T_input, MemoryLayout layout_input,
typename T_kernel, MemoryLayout layout_kernel>
void func_QuantizedConv2D(
const TensorView<T, layout>& input,
const kernel_t& kernel,
const TensorView<QuantizedPacked<T_input>, layout_input>& input,
const TensorView<QuantizedPacked<T_kernel>, layout_kernel>& kernel,
const TensorView<T_FLOAT, MemoryLayout::NHWC>& output,
T_FLOAT scaling_factor[],
binary_convolution_parameters p) {
Expand Down Expand Up @@ -167,10 +173,11 @@ void func_QuantizedConv2D(
Measurement::Stop();
}

template<typename T, MemoryLayout layout>
template <typename T_input, MemoryLayout layout_input,
typename T_kernel, MemoryLayout layout_kernel>
void func_QuantizedConv2DWithThreshold(
const TensorView<T, layout>& input,
const kernel_t& kernel,
const TensorView<QuantizedPacked<T_input>, layout_input>& input,
const TensorView<QuantizedPacked<T_kernel>, layout_kernel>& kernel,
const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output,
const T_FLOAT scaling_factor,
const binary_convolution_parameters& p) {
Expand Down Expand Up @@ -201,10 +208,11 @@ void func_QuantizedConv2DWithThreshold(
Measurement::Stop();
}

template <typename T, MemoryLayout layout>
template <typename T_input, MemoryLayout layout_input,
typename T_kernel, MemoryLayout layout_kernel>
void func_QuantizedConv2DWithThreshold(
const TensorView<T, layout>& input,
const kernel_t& kernel,
const TensorView<QuantizedPacked<T_input>, layout_input>& input,
const TensorView<QuantizedPacked<T_kernel>, layout_kernel>& kernel,
const TensorView<T_FLOAT, MemoryLayout::NHWC>& output,
const T_FLOAT scaling_factor,
const binary_convolution_parameters& p) {
Expand Down Expand Up @@ -238,21 +246,23 @@ void func_QuantizedConv2DWithThreshold(
Measurement::Stop();
}

template <typename T, MemoryLayout layout>
template <typename T_input, MemoryLayout layout_input,
typename T_kernel, MemoryLayout layout_kernel>
void func_QuantizedConv2DWithThreshold(
const TensorView<T, layout>& input,
const kernel_t& kernel,
const TensorView<QuantizedPacked<T_input>, layout_input>& input,
const TensorView<QuantizedPacked<T_kernel>, layout_kernel>& kernel,
const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output,
const T_FLOAT scaling_factor[],
const binary_convolution_parameters& p) {
func_QuantizedConv2DWithThreshold(input, kernel, output, scaling_factor[0],
p);
}

template <typename T, MemoryLayout layout>
template <typename T_input, MemoryLayout layout_input,
typename T_kernel, MemoryLayout layout_kernel>
void func_QuantizedConv2DWithThreshold(
const TensorView<T, layout>& input,
const kernel_t& kernel,
const TensorView<T_input, layout_input>& input,
const TensorView<T_kernel, layout_kernel>& kernel,
const TensorView<T_FLOAT, MemoryLayout::NHWC>& output,
T_FLOAT scaling_factor[],
binary_convolution_parameters p) {
Expand Down
2 changes: 1 addition & 1 deletion dlk/python/dlk/templates/include/pack_input_to_qwords.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void pack_input_to_qwords(
struct binary_convolution_parameters bcp);


int pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
size_t bits_per_input, QUANTIZED_PACKED output[]);

#endif // DLK_PACK_INPUT_TO_QWORDS_H_INCLUDED
2 changes: 1 addition & 1 deletion dlk/python/dlk/templates/include/quantizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ void func_QTZ_linear_mid_tread_half(
const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
const TensorView<T_INT, MemoryLayout::Atom>& nbit,
const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output,
const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output,
BYTE *temporary_buf);

void func_QTZ_linear_mid_tread_half(
Expand Down
10 changes: 1 addition & 9 deletions dlk/python/dlk/templates/include/tensor_view.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ limitations under the License.

#include <cassert>
#include <array>
#include "global.h"
#include "types.h"

enum class MemoryLayout {
Atom, // Scalar object
Expand Down Expand Up @@ -244,12 +244,4 @@ class TensorView<QuantizedPacked<T>, memory_layout> {
tensor_info_t<std::size_t> shape;
};

#ifdef RUN_ON_FPGA
using kernel_t = TensorView<QUANTIZED_PACKED_KERNEL, MemoryLayout::OhIhHWOlIl>;
#elif defined USE_NEON || defined USE_AVX
using kernel_t = TensorView<QUANTIZED_PACKED_KERNEL, MemoryLayout::OHWI>;
#else
using kernel_t = TensorView<QUANTIZED_PACKED_KERNEL, MemoryLayout::HWOI>;
#endif

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::si
}

void QuantizedConv2DTiling(const tiling_input_t& input,
const kernel_t& kernel,
const binary_convolution_parameters &p) {
const tiling_kernel_t& kernel,
const binary_convolution_parameters &p) {
constexpr T_UINT InTypeBitWidth = tiling_input_elem_t::BitCount;
convolution_parameters cp = p.normal_conv_params;
const T_UINT out_channels = cp.output_channels;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ limitations under the License.
#include <cstdio>

#include "de10_nano.h"
#include "func/impl/quantized_conv2d_kn2row.h"
#include "func/impl/quantized_conv2d_accelerator.h"
#include "global.h"
#include "network.h"
#include "pack_input_to_qwords.h"
Expand All @@ -35,8 +35,8 @@ namespace dlk
namespace impl
{

void TCAConv2d(const kn2row_input_t& input,
const kernel_t& kernel,
void TCAConv2d(const tca_input_t& input,
const tca_kernel_t& kernel,
const binary_convolution_parameters &p) {

using namespace dlk;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::si
}

void QuantizedConv2DKn2Row(const kn2row_input_t& input,
const kernel_t& kernel,
const binary_convolution_parameters &p) {
const kn2row_kernel_t& kernel,
const binary_convolution_parameters &p) {
using namespace dlk;

T_UINT ic = p.normal_conv_params.kernel_depth;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::si
}

void QuantizedConv2DTiling(const tiling_input_t& input,
const kernel_t& kernel,
const binary_convolution_parameters &p) {
const tiling_kernel_t& kernel,
const binary_convolution_parameters &p) {
constexpr std::size_t InTypeBitWidth = tiling_input_elem_t::BitCount;
convolution_parameters cp = p.normal_conv_params;
const std::size_t out_channels = cp.output_channels;
Expand Down
Loading

0 comments on commit 12cca0c

Please sign in to comment.