
Change device buf type (#767)
primenumber authored Jan 29, 2020
1 parent 03b718f commit ee960f7
Showing 7 changed files with 32 additions and 35 deletions.
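Across all seven files the pattern is the same: the typed device buffer members become opaque void pointers qualified with a new VOLATILE_IF_FPGA macro, and each use site casts the buffer back to its concrete element type. A minimal self-contained sketch of that pattern (the struct and function names below are illustrative, not taken from the diff):

    #include <cstddef>
    #include <cstdint>

    // On FPGA builds the buffers are memory-mapped device memory, so the
    // compiler must not elide or reorder accesses; on other targets the
    // qualifier expands to nothing.
    #if defined RUN_ON_FPGA
    #define VOLATILE_IF_FPGA volatile
    #else
    #define VOLATILE_IF_FPGA
    #endif

    // Illustrative stand-in for binary_convolution_parameters: the buffer
    // is stored untyped and acquires an element type only at the use site.
    struct conv_params {
      VOLATILE_IF_FPGA void *device_output_buf;
    };

    void store_output(conv_params &p, std::size_t i, std::int16_t value) {
      auto out = reinterpret_cast<VOLATILE_IF_FPGA std::int16_t *>(p.device_output_buf);
      out[i] = value;  // volatile store on FPGA builds, plain store elsewhere
    }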
20 changes: 11 additions & 9 deletions dlk/python/dlk/templates/include/func/quantized_conv2d.h
@@ -62,7 +62,7 @@ void QuantizedConv2D(
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
- dlk::impl::tca_input_t tmp(p.device_input_buf, shape);
+ dlk::impl::tca_input_t tmp((QUANTIZED_PACKED*)p.device_input_buf, shape);
convert_tensor(input, tmp);
dlk::impl::TCAConv2d(tmp, kernel, p);
#elif defined USE_NEON || defined USE_AVX
@@ -73,7 +73,7 @@ void QuantizedConv2D(
p.bin_input_bitwidth,
TilingInTypeBitWidth
};
- dlk::impl::tiling_input_t tmp(p.device_input_buf, shape);
+ dlk::impl::tiling_input_t tmp(reinterpret_cast<dlk::impl::tiling_input_elem_t*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DTiling(tmp, kernel, p);
#else
@@ -84,7 +84,7 @@ void QuantizedConv2D(
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
- dlk::impl::kn2row_input_t tmp(p.device_input_buf, shape);
+ dlk::impl::kn2row_input_t tmp(reinterpret_cast<QUANTIZED_PACKED*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DKn2Row(tmp, kernel, p);
#endif
@@ -121,14 +121,15 @@ void func_QuantizedConv2D(
auto channel_blocks = true_out_channels / b;

size_t area = ncp.output_height * ncp.output_width;
+ auto out_buf = reinterpret_cast<VOLATILE_IF_FPGA BIN_CONV_OUTPUT*>(p.device_output_buf);
#pragma omp parallel for
for (size_t hw = 0; hw < area; ++hw) {
size_t out_index = hw * true_out_channels;
for (size_t s = 0; s < channel_blocks; ++s)
for (size_t d = 0; d < b; ++d)
- output.data()[out_index++] = coeff * p.device_output_buf[hw * b + s * (area * b) + d];
+ output.data()[out_index++] = coeff * out_buf[hw * b + s * (area * b) + d];
for (size_t d = 0; d < true_out_channels - channel_blocks*b; ++d)
- output.data()[out_index++] = coeff * p.device_output_buf[hw * b + channel_blocks * (area * b) + d];
+ output.data()[out_index++] = coeff * out_buf[hw * b + channel_blocks * (area * b) + d];
}

Measurement::Stop();
@@ -160,14 +161,15 @@ void func_QuantizedConv2D(
Measurement::Start("QuantizedConv2D_ApplyScalingFactor");

size_t area = ncp.output_height * ncp.output_width;
+ auto out_buf = reinterpret_cast<VOLATILE_IF_FPGA BIN_CONV_OUTPUT*>(p.device_output_buf);
#pragma omp parallel for
for (size_t hw = 0; hw < area; ++hw) {
size_t out_index = hw * true_out_channels;
for (size_t s = 0; s < channel_blocks; ++s)
for (size_t d = 0; d < b; ++d)
- output.data()[out_index++] = (scaling_factor[s*b + d] * post_qtz_factor) * p.device_output_buf[hw * b + s * (area * b) + d];
+ output.data()[out_index++] = (scaling_factor[s*b + d] * post_qtz_factor) * out_buf[hw * b + s * (area * b) + d];
for (size_t d = 0; d < true_out_channels - channel_blocks*b; ++d)
- output.data()[out_index++] = (scaling_factor[channel_blocks*b + d] * post_qtz_factor) * p.device_output_buf[hw * b + channel_blocks * (area * b) + d];
+ output.data()[out_index++] = (scaling_factor[channel_blocks*b + d] * post_qtz_factor) * out_buf[hw * b + channel_blocks * (area * b) + d];
}

Measurement::Stop();
@@ -227,14 +229,14 @@ void func_QuantizedConv2DWithThreshold(
const auto out_channels = np.output_channels;
const auto true_out_channels = output.get_shape()[3];

- QUANTIZED_PACKED::base_t* ptr = (QUANTIZED_PACKED::base_t*)p.device_output_buf;
+ auto out_buf = reinterpret_cast<VOLATILE_IF_FPGA QUANTIZED_PACKED::base_t*>(p.device_output_buf);
for (unsigned r = 0; r < out_height; ++r) {
for (unsigned c = 0; c < out_width; ++c) {
for (unsigned d = 0; d < true_out_channels; ++d) {
const auto i = r * out_width * p.n_bit + c * p.n_bit;
QUANTIZED_PACKED::base_t bits = 0;
for (unsigned digit = 0; digit < p.n_bit; ++digit) {
- bits |= ((ptr[i + digit] >> d) & 1) << digit;
+ bits |= ((out_buf[i + digit] >> d) & 1) << digit;
}
T_FLOAT tmp = (T_FLOAT)bits;
tmp = tmp / n;
2 changes: 2 additions & 0 deletions dlk/python/dlk/templates/include/global.tpl.h
@@ -23,8 +23,10 @@ limitations under the License.
#include "types.h"

#if defined RUN_ON_FPGA
+ #define VOLATILE_IF_FPGA volatile
using QUANTIZED_PACKED = QuantizedPacked<volatile {{ params.default_qword_dtype.cpptype() }}>;
#else
+ #define VOLATILE_IF_FPGA
using QUANTIZED_PACKED = QuantizedPacked<{{ params.default_qword_dtype.cpptype() }}>;
#endif
using QUANTIZED_PACKED_KERNEL = QuantizedPacked<{{ params.default_qword_dtype.cpptype() }}>;
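global.tpl.h is a Jinja2 template, so the lines above are rendered by the code generator. Assuming params.default_qword_dtype.cpptype() renders to uint32_t (an assumption for illustration; the actual type comes from the generator parameters), the FPGA branch of the generated header would read:

    #define VOLATILE_IF_FPGA volatile
    using QUANTIZED_PACKED = QuantizedPacked<volatile uint32_t>;

Note that QUANTIZED_PACKED_KERNEL stays non-volatile in both branches, presumably because kernel data lives in ordinary host memory rather than in the memory-mapped device buffer.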
10 changes: 2 additions & 8 deletions dlk/python/dlk/templates/include/operators.h
@@ -44,14 +44,8 @@ struct binary_convolution_parameters {
T_UINT bin_input_bitwidth;
T_UINT bin_kernel_ndata;
T_UINT layer_index;
- QUANTIZED_PACKED *device_input_buf;
- BIN_CONV_OUTPUT *device_output_buf;
- void print_device_output_buf(const std::string message) {
-   std::cout << message << std::endl;
-   for (int i = 0; i < 4; i++) {
-     std::cout << device_output_buf[i] << std::endl;
-   }
- }
+ VOLATILE_IF_FPGA void *device_input_buf;
+ VOLATILE_IF_FPGA void *device_output_buf;
BIN_CONV_OUTPUT *thresholds;
T_UINT n_bit;
T_FLOAT max_value;
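The print_device_output_buf debug helper is removed outright; it could not have been kept as-is, since indexing a void pointer does not compile. For illustration only (this code is not part of the commit), an equivalent helper under the new declarations would need an explicit cast:

    // Hypothetical re-creation of the removed debug helper.
    void print_device_output_buf(const binary_convolution_parameters &p,
                                 const std::string &message) {
      std::cout << message << std::endl;
      auto buf = reinterpret_cast<VOLATILE_IF_FPGA BIN_CONV_OUTPUT *>(p.device_output_buf);
      for (int i = 0; i < 4; ++i) {
        std::cout << buf[i] << std::endl;
      }
    }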
6 changes: 1 addition & 5 deletions dlk/python/dlk/templates/include/types.h
@@ -81,10 +81,6 @@ struct Base<QuantizedPacked<T>> {
using type = T;
};

- #if defined RUN_ON_FPGA
- typedef volatile T_INT16 BIN_CONV_OUTPUT;
- #else
- typedef T_INT16 BIN_CONV_OUTPUT;
- #endif
+ using BIN_CONV_OUTPUT = T_INT16;

#endif // TYPES_H
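With this hunk, volatility moves from the value type to the pointer declarations that use VOLATILE_IF_FPGA: BIN_CONV_OUTPUT is now plain T_INT16 on every target, so locals and temporaries of that type can live in registers, while reads and writes that go through a device-buffer pointer still get volatile semantics on FPGA builds. A hypothetical helper showing the distinction:

    // Only the loads through out_buf are volatile on FPGA builds; the
    // accumulator itself is an ordinary value.
    BIN_CONV_OUTPUT sum_outputs(VOLATILE_IF_FPGA BIN_CONV_OUTPUT *out_buf,
                                std::size_t n) {
      BIN_CONV_OUTPUT acc = 0;  // plain T_INT16: free to live in a register
      for (std::size_t i = 0; i < n; ++i) {
        acc += out_buf[i];
      }
      return acc;
    }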
@@ -324,8 +324,8 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
+ (row_high + row) * out_width * OutChUnroll2
+ (col_high + col) * OutChUnroll2
+ Om;
- vst1q_s16(p.device_output_buf + index + 0, v0);
- vst1q_s16(p.device_output_buf + index + 8, v1);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 0, v0);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 8, v1);
}
}
}
@@ -588,8 +588,8 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
+ (row_high + row) * out_width * OutChUnroll2
+ (col_high + col) * OutChUnroll2
+ Om;
- vst1q_s16(p.device_output_buf + index + 0, v0);
- vst1q_s16(p.device_output_buf + index + 8, v1);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 0, v0);
+ vst1q_s16(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf) + index + 8, v1);
}
}
}
@@ -54,12 +54,13 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,

Measurement::Start("quantized-kn2row");

+ auto out_buf = reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf);
auto output_ = MatrixView<BIN_CONV_OUTPUT, MatrixOrder::ColMajor>(
- p.device_output_buf, oc, ih * iw);
+ out_buf, oc, ih * iw);
auto kernel_ = MatrixView<QUANTIZED_PACKED_KERNEL, MatrixOrder::RowMajor>(
kernel.data(), oc * kh * kw, ic / 32);
if (kh == kw && kw == 3) {
- std::fill(p.device_output_buf, p.device_output_buf + oc * oh * ow, 0);
+ std::fill(out_buf, out_buf + oc * oh * ow, 0);
for (std::size_t offset = 0; offset < ih * iw; offset += MAX_SIZE_KN2ROW_COL_BLOCK) {
const auto col_block = std::min(static_cast<std::size_t>(MAX_SIZE_KN2ROW_COL_BLOCK), ih * iw - offset);
auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
@@ -74,7 +75,7 @@ auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
input.data(), ic / 16, ih * iw);
auto output_ = MatrixView<BIN_CONV_OUTPUT, MatrixOrder::ColMajor>(
- p.device_output_buf, oc, ih * iw);
+ out_buf, oc, ih * iw);
quantized_matrix_multiplication(kernel_, input_, output_);
} else {
std::cerr << "Only 1x1 or 3x3 convolutions are supported." << std::endl;
@@ -85,7 +86,7 @@
if (p.thresholds != nullptr) {
QUANTIZED_PACKED *buf_ptr = reinterpret_cast<QUANTIZED_PACKED*>(temp_buf_ptr);
ApplyThresholds(output_, p);
- pack_16bit(p.device_output_buf, buf_ptr, out_size);
+ pack_16bit(out_buf, buf_ptr, out_size);
const std::size_t b = 32;
TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>::tensor_info_t<std::size_t> buf_shape = {
oh, ow, (oc + b - 1) / b, p.n_bit, b
@@ -98,20 +99,20 @@
p.n_bit,
b
};
- TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl> out((QUANTIZED_PACKED*)p.device_output_buf, out_shape);
+ TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl> out(reinterpret_cast<QUANTIZED_PACKED*>(p.device_output_buf), out_shape);
convert_tensor(buf_tensor, out);
} else {
BIN_CONV_OUTPUT *buf_ptr = reinterpret_cast<BIN_CONV_OUTPUT*>(temp_buf_ptr);
const std::size_t b = 32;
- std::copy(p.device_output_buf, p.device_output_buf + out_size, buf_ptr);
+ std::copy(out_buf, out_buf + out_size, buf_ptr);
TensorView<BIN_CONV_OUTPUT, MemoryLayout::HWC>::tensor_info_t<std::size_t> buf_shape = {
oh, ow, oc
};
TensorView<BIN_CONV_OUTPUT, MemoryLayout::HWC> buf_tensor(buf_ptr, buf_shape);
TensorView<BIN_CONV_OUTPUT, MemoryLayout::ChHWCl>::tensor_info_t<std::size_t> out_shape = {
(oc + b - 1) / b, oh, ow, b
};
- TensorView<BIN_CONV_OUTPUT, MemoryLayout::ChHWCl> out(p.device_output_buf, out_shape);
+ TensorView<BIN_CONV_OUTPUT, MemoryLayout::ChHWCl> out(reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf), out_shape);
convert_tensor(buf_tensor, out);
}

@@ -299,14 +299,15 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
APPLY_PACK(2);
APPLY_PACK(3);
} else {
+ auto out_buf = reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf);
#define OUT(i) \
if (col + i >= out_width) continue; \
do { \
const auto out_index = Ohh * out_height * out_width * OutChUnroll2 \
+ row * out_width * OutChUnroll2 \
+ (col + i) * OutChUnroll2 \
+ Om * OutChUnroll; \
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(p.device_output_buf + out_index), ans##i); \
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_buf + out_index), ans##i); \
} while(0)
OUT(0);
OUT(1);
@@ -512,6 +513,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
}
}
} else {
+ auto out_buf = reinterpret_cast<BIN_CONV_OUTPUT*>(p.device_output_buf);
for (std::size_t row = 0; row < TileHeight; ++row) {
if (row_high + row >= out_height) break;
for (std::size_t col = 0; col < TileWidth; ++col) {
@@ -523,7 +525,7 @@
+ (row_high + row) * out_width * OutChUnroll2
+ (col_high + col) * OutChUnroll2
+ Om * OutChUnroll;
- _mm_storeu_si128(reinterpret_cast<__m128i*>(p.device_output_buf + index), vec);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(out_buf + index), vec);
}
}
}
