
Change device buf type (#767)
primenumber authored Jan 29, 2020
1 parent 03b718f commit ee960f7
Showing 7 changed files with 32 additions and 35 deletions.
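Across all seven files the pattern is the same: the typed device buffer members become opaque void pointers qualified with a new VOLATILE_IF_FPGA macro, and each use site casts the buffer back to its concrete element type. A minimal self-contained sketch of that pattern (the struct and function names below are illustrative, not taken from the diff):

    #include <cstddef>
    #include <cstdint>

    // On FPGA builds the buffers are memory-mapped device memory, so the
    // compiler must not elide or reorder accesses; on other targets the
    // qualifier expands to nothing.
    #if defined RUN_ON_FPGA
    #define VOLATILE_IF_FPGA volatile
    #else
    #define VOLATILE_IF_FPGA
    #endif

    // Illustrative stand-in for binary_convolution_parameters: the buffer
    // is stored untyped and acquires an element type only at the use site.
    struct conv_params {
      VOLATILE_IF_FPGA void *device_output_buf;
    };

    void store_output(conv_params &p, std::size_t i, std::int16_t value) {
      auto out = reinterpret_cast<VOLATILE_IF_FPGA std::int16_t *>(p.device_output_buf);
      out[i] = value;  // volatile store on FPGA builds, plain store elsewhere
    }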
20 changes: 11 additions & 9 deletions dlk/python/dlk/templates/include/func/quantized_conv2d.h
@@ -62,7 +62,7 @@ void QuantizedConv2D(
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
- dlk::impl::tca_input_t tmp(p.device_input_buf, shape);
+ dlk::impl::tca_input_t tmp((QUANTIZED_PACKED*)p.device_input_buf, shape);
convert_tensor(input, tmp);
dlk::impl::TCAConv2d(tmp, kernel, p);
#elif defined USE_NEON || defined USE_AVX
@@ -73,7 +73,7 @@ void QuantizedConv2D(
p.bin_input_bitwidth,
TilingInTypeBitWidth
};
- dlk::impl::tiling_input_t tmp(p.device_input_buf, shape);
+ dlk::impl::tiling_input_t tmp(reinterpret_cast<dlk::impl::tiling_input_elem_t*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DTiling(tmp, kernel, p);
#else
@@ -84,7 +84,7 @@ void QuantizedConv2D(
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
- dlk::impl::kn2row_input_t tmp(p.device_input_buf, shape);
+ dlk::impl::kn2row_input_t tmp(reinterpret_cast<QUANTIZED_PACKED*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DKn2Row(tmp, kernel, p);
#endif
@@ -121,14 +121,15 @@ void func_QuantizedConv2D(
auto channel_blocks = true_out_channels / b;

size_t area = ncp.output_height * ncp.output_width;
+ auto out_buf = reinterpret_cast<VOLATILE_IF_FPGA BIN_CONV_OUTPUT*>(p.device_output_buf);
#pragma omp parallel for
for (size_t hw = 0; hw < area; ++hw) {
size_t out_index = hw * true_out_channels;
for (size_t s = 0; s < channel_blocks; ++s)
for (size_t d = 0; d < b; ++d)
- output.data()[out_index++] = coeff * p.device_output_buf[hw * b + s * (area * b) + d];
+ output.data()[out_index++] = coeff * out_buf[hw * b + s * (area * b) + d];
for (size_t d = 0; d < true_out_channels - channel_blocks*b; ++d)
- output.data()[out_index++] = coeff * p.device_output_buf[hw * b + channel_blocks * (area * b) + d];
+ output.data()[out_index++] = coeff * out_buf[hw * b + channel_blocks * (area * b) + d];
}

Measurement::Stop();
@@ -160,14 +161,15 @@ void func_QuantizedConv2D(
Measurement::Start("QuantizedConv2D_ApplyScalingFactor");

size_t area = ncp.output_height * ncp.output_width;
+ auto out_buf = reinterpret_cast<VOLATILE_IF_FPGA BIN_CONV_OUTPUT*>(p.device_output_buf);
#pragma omp parallel for
for (size_t hw = 0; hw < area; ++hw) {
size_t out_index = hw * true_out_channels;
for (size_t s = 0; s < channel_blocks; ++s)
for (size_t d = 0; d < b; ++d)
- output.data()[out_index++] = (scaling_factor[s*b + d] * post_qtz_factor) * p.device_output_buf[hw * b + s * (area * b) + d];
+ output.data()[out_index++] = (scaling_factor[s*b + d] * post_qtz_factor) * out_buf[hw * b + s * (area * b) + d];
for (size_t d = 0; d < true_out_channels - channel_blocks*b; ++d)
- output.data()[out_index++] = (scaling_factor[channel_blocks*b + d] * post_qtz_factor) * p.device_output_buf[hw * b + channel_blocks * (area * b) + d];
+ output.data()[out_index++] = (scaling_factor[channel_blocks*b + d] * post_qtz_factor) * out_buf[hw * b + channel_blocks * (area * b) + d];
}

Measurement::Stop();
@@ -227,14 +229,14 @@ void func_QuantizedConv2DWithThreshold(
const auto out_channels = np.output_channels;
const auto true_out_channels = output.get_shape()[3];

- QUANTIZED_PACKED::base_t* ptr = (QUANTIZED_PACKED::base_t*)p.device_output_buf;
+ auto out_buf = reinterpret_cast<VOLATILE_IF_FPGA QUANTIZED_PACKED::base_t*>(p.device_output_buf);
for (unsigned r = 0; r < out_height; ++r) {
for (unsigned c = 0; c < out_width; ++c) {
for (unsigned d = 0; d < true_out_channels; ++d) {
const auto i = r * out_width * p.n_bit + c * p.n_bit;
QUANTIZED_PACKED::base_t bits = 0;
for (unsigned digit = 0; digit < p.n_bit; ++digit) {
- bits |= ((ptr[i + digit] >> d) & 1) << digit;
+ bits |= ((out_buf[i + digit] >> d) & 1) << digit;
}
T_FLOAT tmp = (T_FLOAT)bits;
tmp = tmp / n;
2 changes: 2 additions & 0 deletions dlk/python/dlk/templates/include/global.tpl.h
@@ -23,8 +23,10 @@ limitations under the License.
#include "types.h"

#if defined RUN_ON_FPGA
+ #define VOLATILE_IF_FPGA volatile
using QUANTIZED_PACKED = QuantizedPacked<volatile {{ params.default_qword_dtype.cpptype() }}>;
#else
+ #define VOLATILE_IF_FPGA
using QUANTIZED_PACKED = QuantizedPacked<{{ params.default_qword_dtype.cpptype() }}>;
#endif
using QUANTIZED_PACKED_KERNEL = QuantizedPacked<{{ params.default_qword_dtype.cpptype() }}>;
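global.tpl.h is a Jinja2 template, so the lines above are rendered by the code generator. Assuming params.default_qword_dtype.cpptype() renders to uint32_t (an assumption for illustration; the actual type comes from the generator parameters), the FPGA branch of the generated header would read:

    #define VOLATILE_IF_FPGA volatile
    using QUANTIZED_PACKED = QuantizedPacked<volatile uint32_t>;

Note that QUANTIZED_PACKED_KERNEL stays non-volatile in both branches, presumably because kernel data lives in ordinary host memory rather than in the memory-mapped device buffer.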
10 changes: 2 additions & 8 deletions dlk/python/dlk/templates/include/operators.h
@@ -44,14 +44,8 @@ struct binary_convolution_parameters {
T_UINT bin_input_bitwidth;
T_UINT bin_kernel_ndata;
T_UINT layer_index;
- QUANTIZED_PACKED *device_input_buf;
- BIN_CONV_OUTPUT *device_output_buf;
- void print_device_output_buf(const std::string message) {
-   std::cout << message << std::endl;
-   for (int i = 0; i < 4; i++) {
-     std::cout << device_output_buf[i] << std::endl;
-   }
- }
+ VOLATILE_IF_FPGA void *device_input_buf;
+ VOLATILE_IF_FPGA void *device_output_buf;
BIN_CONV_OUTPUT *thresholds;
T_UINT n_bit;
T_FLOAT max_value;
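The print_device_output_buf debug helper is removed outright; it could not have been kept as-is, since indexing a void pointer does not compile. For illustration only (this code is not part of the commit), an equivalent helper under the new declarations would need an explicit cast:

    // Hypothetical re-creation of the removed debug helper.
    void print_device_output_buf(const binary_convolution_parameters &p,
                                 const std::string &message) {
      std::cout << message << std::endl;
      auto buf = reinterpret_cast<VOLATILE_IF_FPGA BIN_CONV_OUTPUT *>(p.device_output_buf);
      for (int i = 0; i < 4; ++i) {
        std::cout << buf[i] << std::endl;
      }
    }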
6 changes: 1 addition & 5 deletions dlk/python/dlk/templates/include/types.h
@@ -81,10 +81,6 @@ struct Base<QuantizedPacked<T>> {
using type = T;
};

- #if defined RUN_ON_FPGA
- typedef volatile T_INT16 BIN_CONV_OUTPUT;
- #else
- typedef T_INT16 BIN_CONV_OUTPUT;
- #endif
+ using BIN_CONV_OUTPUT = T_INT16;

#endif // TYPES_H
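With this hunk, volatility moves from the value type to the pointer declarations that use VOLATILE_IF_FPGA: BIN_CONV_OUTPUT is now plain T_INT16 on every target, so locals and temporaries of that type can live in registers, while reads and writes that go through a device-buffer pointer still get volatile semantics on FPGA builds. A hypothetical helper showing the distinction:

    // Only the loads through out_buf are volatile on FPGA builds; the
    // accumulator itself is an ordinary value.
    BIN_CONV_OUTPUT sum_outputs(VOLATILE_IF_FPGA BIN_CONV_OUTPUT *out_buf,
                                std::size_t n) {
      BIN_CONV_OUTPUT acc = 0;  // plain T_INT16: free to live in a register
      for (std::size_t i = 0; i < n; ++i) {
        acc += out_buf[i];
      }
      return acc;
    }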
@@ -324,8 +324,8 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
+ (row_high + row) * out_width * OutChUnroll2
+ (col_high + col) * OutChUnroll2
+ Om;
- vst1q_s16(p.device_output_buf + index + 0, v0);
- vst1q_s16(p.device_output_buf + index + 8, v1);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 0, v0);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 8, v1);
}
}
}
@@ -588,8 +588,8 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
+ (row_high + row) * out_width * OutChUnroll2
+ (col_high + col) * OutChUnroll2
+ Om;
- vst1q_s16(p.device_output_buf + index + 0, v0);
- vst1q_s16(p.device_output_buf + index + 8, v1);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 0, v0);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 8, v1);
}
}
}
@@ -54,12 +54,13 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,

Measurement::Start("quantized-kn2row");

+ auto out_buf = reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf);
auto output_ = MatrixView<BIN_CONV_OUTPUT, MatrixOrder::ColMajor>(
- p.device_output_buf, oc, ih * iw);
+ out_buf, oc, ih * iw);
auto kernel_ = MatrixView<QUANTIZED_PACKED_KERNEL, MatrixOrder::RowMajor>(
kernel.data(), oc * kh * kw, ic / 32);
if (kh == kw && kw == 3) {
- std::fill(p.device_output_buf, p.device_output_buf + oc * oh * ow, 0);
+ std::fill(out_buf, out_buf + oc * oh * ow, 0);
for (std::size_t offset = 0; offset < ih * iw; offset += MAX_SIZE_KN2ROW_COL_BLOCK) {
const auto col_block = std::min(static_cast<std::size_t>(MAX_SIZE_KN2ROW_COL_BLOCK), ih * iw - offset);
auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
@@ -74,7 +75,7 @@ auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
input.data(), ic / 16, ih * iw);
auto output_ = MatrixView<BIN_CONV_OUTPUT, MatrixOrder::ColMajor>(
- p.device_output_buf, oc, ih * iw);
+ out_buf, oc, ih * iw);
quantized_matrix_multiplication(kernel_, input_, output_);
} else {
std::cerr << "Only 1x1 or 3x3 convolutions are supported." << std::endl;
@@ -85,7 +86,7 @@
if (p.thresholds != nullptr) {
QUANTIZED_PACKED *buf_ptr = reinterpret_cast<QUANTIZED_PACKED*>(temp_buf_ptr);
ApplyThresholds(output_, p);
- pack_16bit(p.device_output_buf, buf_ptr, out_size);
+ pack_16bit(out_buf, buf_ptr, out_size);
const std::size_t b = 32;
TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>::tensor_info_t<std::size_t> buf_shape = {
oh, ow, (oc + b - 1) / b, p.n_bit, b
@@ -98,20 +99,20 @@
p.n_bit,
b
};
- TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl> out((QUANTIZED_PACKED*)p.device_output_buf, out_shape);
+ TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl> out(reinterpret_cast<QUANTIZED_PACKED*>(p.device_output_buf), out_shape);
convert_tensor(buf_tensor, out);
} else {
BIN_CONV_OUTPUT *buf_ptr = reinterpret_cast<BIN_CONV_OUTPUT*>(temp_buf_ptr);
const std::size_t b = 32;
- std::copy(p.device_output_buf, p.device_output_buf + out_size, buf_ptr);
+ std::copy(out_buf, out_buf + out_size, buf_ptr);
TensorView<BIN_CONV_OUTPUT, MemoryLayout::HWC>::tensor_info_t<std::size_t> buf_shape = {
oh, ow, oc
};
TensorView<BIN_CONV_OUTPUT, MemoryLayout::HWC> buf_tensor(buf_ptr, buf_shape);
TensorView<BIN_CONV_OUTPUT, MemoryLayout::ChHWCl>::tensor_info_t<std::size_t> out_shape = {
(oc + b - 1) / b, oh, ow, b
};
- TensorView<BIN_CONV_OUTPUT, MemoryLayout::ChHWCl> out(p.device_output_buf, out_shape);
+ TensorView<BIN_CONV_OUTPUT, MemoryLayout::ChHWCl> out(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf), out_shape);
convert_tensor(buf_tensor, out);
}

@@ -299,14 +299,15 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
APPLY_PACK(2);
APPLY_PACK(3);
} else {
+ auto out_buf = reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf);
#define OUT(i) \
if (col + i >= out_width) continue; \
do { \
const auto out_index = Ohh * out_height * out_width * OutChUnroll2 \
+ row * out_width * OutChUnroll2 \
+ (col + i) * OutChUnroll2 \
+ Om * OutChUnroll; \
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(p.device_output_buf + out_index), ans##i); \
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_buf + out_index), ans##i); \
} while(0)
OUT(0);
OUT(1);
@@ -512,6 +513,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
}
}
} else {
+ auto out_buf = reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf);
for (std::size_t row = 0; row < TileHeight; ++row) {
if (row_high + row >= out_height) break;
for (std::size_t col = 0; col < TileWidth; ++col) {
@@ -523,7 +525,7 @@
+ (row_high + row) * out_width * OutChUnroll2
+ (col_high + col) * OutChUnroll2
+ Om * OutChUnroll;
- _mm_storeu_si128(reinterpret_cast<__m128i*>(p.device_output_buf + index), vec);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(out_buf + index), vec);
}
}
}
