Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Allocate temporary buffers in Network::init #664

Merged
merged 16 commits into from
Dec 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions dlk/python/dlk/code_generater.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,8 @@ def generate_thresholds(self):
src_template_path = path.join('manual', 'consts', 'thresholds.tpl.cpp')
header_template_path = path.join('manual', 'consts', 'thresholds.tpl.h')

qconvs_with_ts = [x for x in self.graph.non_variables
if x.op_type == 'Conv'
and cast(Conv, x).is_quantized
and cast(Conv, x).has_thresholds]
qconvs_with_ts = [x for x in self.graph.convs(quantized_only=True)
if x.has_thresholds]

self.template.generate(src_template_path,
self.src_dir,
Expand Down
6 changes: 4 additions & 2 deletions dlk/python/dlk/core/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def run(self):

return self.format_string(
f"""
func_QTZ_linear_mid_tread_half({inputs_string}, {op.name});
func_QTZ_linear_mid_tread_half({inputs_string}, {op.name}, quantize_tmp_buffer.get());
"""
)

Expand Down Expand Up @@ -119,7 +119,7 @@ def run(self):
inputs_string = self.inputs_to_string(op, input_ops)

if op.has_thresholds:
threshold = f'{op.name}_thresholds'
threshold = f'{op.name}_thresholds_converted.get()'
thresholds_addr = f'THRESHOLD_ADDR + {op.name}_thresholds_offset'
conv_func = 'func_QuantizedConv2DWithThreshold'
nbit_aqtz = self.op.a_quantizer[0].nbit
Expand All @@ -146,6 +146,7 @@ def run(self):
Conv2D_struct.padding = {pad};
Conv2D_struct.stride_along_height = {stride};
Conv2D_struct.stride_along_width = {stride};
Conv2D_struct.temporary_buf = qconv_tmp_buffer.get();

binConv2D_struct.normal_conv_params = Conv2D_struct;
binConv2D_struct.bin_input_extra_bits = 0;
Expand Down Expand Up @@ -192,6 +193,7 @@ def run(self):
Conv2D_struct.padding = {pad};
Conv2D_struct.stride_along_height = {stride};
Conv2D_struct.stride_along_width = {stride};
Conv2D_struct.temporary_buf = conv_tmp_buffer.get();

func_Conv2D({inputs_string}, {op.name}, Conv2D_struct);
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace impl {
using kn2row_input_elem_t = QUANTIZED_PACKED;

#ifndef RUN_ON_FPGA
void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels);
using kn2row_input_t = TensorView<kn2row_input_elem_t, MemoryLayout::HWChBCl>;
void QuantizedConv2DKn2Row(const kn2row_input_t& input,
const kernel_t& kernel,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ using tiling_input_t = TensorView<tiling_input_elem_t, MemoryLayout::ChHWBCl>;
void pack_input_for_tiling(const TensorView<QUANTIZED_NOT_PACKED, MemoryLayout::NHWC>& input,
const tiling_input_t& output);

void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels);

void QuantizedConv2DTiling(const tiling_input_t& input,
const kernel_t& kernel,
const binary_convolution_parameters &p);
Expand Down
8 changes: 8 additions & 0 deletions dlk/python/dlk/templates/include/global.tpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ limitations under the License.
#include <type_traits>
#include "func/impl/pop_count.h"

// BYTE is the element type used for raw, untyped byte buffers.
//
// <cstddef> has to be included BEFORE __cpp_lib_byte is tested: the
// feature-test macro is provided by <cstddef> (and <version>), so testing it
// first makes the outcome depend on which headers happened to be included
// earlier. BYTE could then name std::byte in one translation unit and the
// fallback enum in another, giving BYTE* parameters different mangled names.
#include <cstddef>
#ifdef __cpp_lib_byte
using BYTE = std::byte;
#else
// Fallback for standard libraries without std::byte: an enum with the same
// underlying type.
enum class byte : unsigned char {};
using BYTE = byte;
#endif

typedef uint32_t T_UINT;
typedef int32_t T_INT;
typedef float T_FLOAT;
Expand Down
22 changes: 14 additions & 8 deletions dlk/python/dlk/templates/include/matrix/multiplication.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,27 @@ limitations under the License.
#ifndef DLK_MATRIX_MULTIPLICATION_H_INCLUDED
#define DLK_MATRIX_MULTIPLICATION_H_INCLUDED

#include "global.h"
#include "matrix_view.h"
#include "time_measurement.h"

namespace dlk {

namespace details {

constexpr std::size_t MAX_UNROLL = 16; // hard coded, not configurable

void matrix_multiplication_col3(
MatrixView<float, MatrixOrder::RowMajor>& A,
MatrixView<float, MatrixOrder::ColMajor>& B,
MatrixView<float, MatrixOrder::ColMajor>& C);
MatrixView<float, MatrixOrder::ColMajor>& C,
BYTE *temporary_buf);

void matrix_multiplication_impl(
MatrixView<float, MatrixOrder::RowMajor>& A,
MatrixView<float, MatrixOrder::ColMajor>& B,
MatrixView<float, MatrixOrder::ColMajor>& C);
MatrixView<float, MatrixOrder::RowMajor>& A,
MatrixView<float, MatrixOrder::ColMajor>& B,
MatrixView<float, MatrixOrder::ColMajor>& C,
BYTE *temporary_buf);

} // namespace details

Expand All @@ -40,21 +45,22 @@ template<typename T, typename U, typename V>
void matrix_multiplication(
MatrixView<T, MatrixOrder::RowMajor>& A,
MatrixView<U, MatrixOrder::ColMajor>& B,
MatrixView<V, MatrixOrder::ColMajor>& C) {
MatrixView<V, MatrixOrder::ColMajor>& C,
BYTE *temporary_buf) {

assert(A.cols() == B.rows());
Measurement::Start("matrix_multiplication");

#ifdef USE_NEON
if (A.cols() == 3 && A.rows() % 4 == 0) {
details::matrix_multiplication_col3(A, B, C);
details::matrix_multiplication_col3(A, B, C, temporary_buf);
} else {
details::matrix_multiplication_impl(A, B, C);
details::matrix_multiplication_impl(A, B, C, temporary_buf);
}
Measurement::Stop();
return;
#elif defined USE_AVX
details::matrix_multiplication_impl(A, B, C);
details::matrix_multiplication_impl(A, B, C, temporary_buf);
Measurement::Stop();
return;
#endif
Expand Down
14 changes: 14 additions & 0 deletions dlk/python/dlk/templates/include/network.tpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License.
#ifndef NETWORK_H_INCLUDED
#define NETWORK_H_INCLUDED

#include <memory>
#include "global.h"
#include "dma_buffer.h"

Expand Down Expand Up @@ -60,6 +61,10 @@ class SYM_PUBLIC Network
QUANTIZED_PACKED *device_input_buf = 0;
BIN_CONV_OUTPUT *device_output_buf = 0;

std::unique_ptr<BYTE[]> qconv_tmp_buffer;
std::unique_ptr<BYTE[]> conv_tmp_buffer;
std::unique_ptr<BYTE[]> quantize_tmp_buffer;

const T_INT input_rank = {{ graph_input.rank }};
const T_INT input_shape[{{ graph_input.rank }}] = { {{ graph_input.view.shape_as_cpp }} };

Expand Down Expand Up @@ -98,6 +103,15 @@ class SYM_PUBLIC Network
{% endfor -%}
const uint32_t total_thresholds_size = std::max(1, {{th_offset.o}});
#endif // RUN_ON_FPGA
{% for qconv in graph.convs(quantized_only=True) -%}
{% if qconv.has_thresholds -%}
{% set b = 32 -%}
{% set channels_padded = qconv.channel + (b - qconv.channel % b) % b -%}
const std::unique_ptr<BIN_CONV_OUTPUT[]> {{qconv.name}}_thresholds_converted = std::make_unique<BIN_CONV_OUTPUT[]>({{channels_padded}} * NUM_OF_A2W1_THRESHOLD);
{% else -%}
const std::unique_ptr<BIN_CONV_OUTPUT[]> {{qconv.name}}_thresholds_converted;
{% endif -%}
{% endfor -%}
};

#endif // NETWORK_H_INCLUDED
Expand Down
1 change: 1 addition & 0 deletions dlk/python/dlk/templates/include/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ struct convolution_parameters {
T_UINT stride_along_height;
T_UINT stride_along_width;
T_UINT padding;
BYTE *temporary_buf;
};

struct binary_convolution_parameters {
Expand Down
6 changes: 4 additions & 2 deletions dlk/python/dlk/templates/include/quantizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,14 @@ void func_QTZ_linear_mid_tread_half(
const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
const TensorView<T_INT, MemoryLayout::Atom>& nbit,
const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output);
const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output,
BYTE *temporary_buf);

void func_QTZ_linear_mid_tread_half(
const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
const TensorView<T_INT, MemoryLayout::Atom>& nbit,
const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
const TensorView<T_FLOAT, MemoryLayout::NHWC>& output);
const TensorView<T_FLOAT, MemoryLayout::NHWC>& output,
BYTE *temporary_buf);

#endif // QUANTIZER_H_INCLUDED
25 changes: 14 additions & 11 deletions dlk/python/dlk/templates/src/func/conv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
assert(p.input_height > 0);
assert(p.input_width > 0);

static U buf[MAX_SIZE_KN2ROW_BUFFER_PER_LAYER];
T* buf = reinterpret_cast<T*>(p.temporary_buf) + MAX_SIZE_KERNELS_PER_LAYER; // offset comes from kernel layout convert buffer
BYTE* matmul_buf = reinterpret_cast<BYTE*>(buf + MAX_SIZE_KN2ROW_BUFFER_PER_LAYER);

Measurement::Stop();

Expand All @@ -66,7 +67,7 @@ void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
auto input_ = dlk::MatrixView<T, dlk::MatrixOrder::ColMajor>(input.data() + ic * offset, ic, col_block);
auto buf_ = dlk::MatrixView<U, dlk::MatrixOrder::ColMajor>(buf, oc * kh * kw, col_block);

dlk::matrix_multiplication(kernels_, input_, buf_);
dlk::matrix_multiplication(kernels_, input_, buf_, matmul_buf);
dlk::matrix_shift_add(buf_, output_, p, offset);
}

Expand Down Expand Up @@ -108,16 +109,18 @@ void conv1x1_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
Measurement::Start("kn2row-1x1");


assert(p.input_height > 0);
assert(p.input_width > 0);
assert(p.input_height > 0);
assert(p.input_width > 0);

auto kernels_ = dlk::MatrixView<T, dlk::MatrixOrder::RowMajor>(kernels.data(), oc * kh * kw, ic);
auto input_ = dlk::MatrixView<T, dlk::MatrixOrder::ColMajor>(input.data(), ic, p.input_height * p.input_width);
auto output_ = dlk::MatrixView<U, dlk::MatrixOrder::ColMajor>(output.data(), oc, p.input_height * p.input_width);
auto kernels_ = dlk::MatrixView<T, dlk::MatrixOrder::RowMajor>(kernels.data(), oc * kh * kw, ic);
auto input_ = dlk::MatrixView<T, dlk::MatrixOrder::ColMajor>(input.data(), ic, p.input_height * p.input_width);
auto output_ = dlk::MatrixView<U, dlk::MatrixOrder::ColMajor>(output.data(), oc, p.input_height * p.input_width);

dlk::matrix_multiplication(kernels_, input_, output_);
// offset comes from kernel layout convert buffer
BYTE* matmul_buf = p.temporary_buf + MAX_SIZE_KERNELS_PER_LAYER * sizeof(T);
dlk::matrix_multiplication(kernels_, input_, output_, matmul_buf);

Measurement::Stop();
Measurement::Stop();
}

template<typename T>
Expand Down Expand Up @@ -183,15 +186,15 @@ void convolution(
return;
} else if (p.kernel_height == 3 && p.kernel_width == 3 && p.padding == 1) {
int kernels_size = p.kernel_height * p.kernel_width * p.kernel_depth * p.output_channels;
const auto kernels_hwoi_buf = std::make_unique<T[]>(kernels_size);
T* buf = reinterpret_cast<T*>(p.temporary_buf);
using hwoi_t = TensorView<T, MemoryLayout::HWOI>;
typename hwoi_t::template tensor_info_t<std::size_t> hwoi_shape = {
p.kernel_height,
p.kernel_width,
p.output_channels,
p.kernel_depth
};
hwoi_t kernels_hwoi(kernels_hwoi_buf.get(), hwoi_shape);
hwoi_t kernels_hwoi(buf, hwoi_shape);
ohwi_to_hwoi(kernels, kernels_hwoi, p);
conv3x3_kn2row(input, kernels_hwoi, output, p);
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ namespace dlk {

namespace impl {

static auto buf_th = std::make_unique<BIN_CONV_OUTPUT[]>(NUM_OF_A2W1_THRESHOLD * MAX_IN_C);

void pack_input_for_tiling(const TensorView<QUANTIZED_NOT_PACKED, MemoryLayout::NHWC>& input,
const tiling_input_t& output) {
Measurement::Start("Pack_input_for_tiling");
Expand Down Expand Up @@ -72,6 +70,33 @@ void pack_input_for_tiling(const TensorView<QUANTIZED_NOT_PACKED, MemoryLayout::
Measurement::Stop();
}

// Copies the per-channel threshold table from `input` to `output`, adjusting
// the thresholds of channels whose flag is negative.
//
// Each channel occupies NUM_OF_A2W1_THRESHOLD consecutive elements: three
// threshold values followed by a flag at index 3. For a channel with a
// negative flag every threshold is increased by one; the flag itself is
// always copied unchanged.
//
// input    source table, `channels` * NUM_OF_A2W1_THRESHOLD elements
// output   destination table of the same size
// channels number of channels to convert
void convert_thresholds(BIN_CONV_OUTPUT *input, BIN_CONV_OUTPUT *output, std::size_t channels) {
  std::size_t i = 0;

  // Vector body: 8 channels per iteration, de-interleaved by vld4q_s16.
  for (; i + 8 <= channels; i += 8) {
    const auto v = vld4q_s16(input + NUM_OF_A2W1_THRESHOLD * i);
    // Lanes are all-ones (-1 as int16) where the flag is negative, 0 otherwise.
    const auto is_neg = vreinterpretq_s16_u16(vmvnq_u16(vcgeq_s16(v.val[3], vdupq_n_s16(0))));
    int16x8x4_t res;
    // Subtracting -1 adds one to the thresholds of negative-flag channels.
    res.val[0] = vsubq_s16(v.val[0], is_neg);
    res.val[1] = vsubq_s16(v.val[1], is_neg);
    res.val[2] = vsubq_s16(v.val[2], is_neg);
    res.val[3] = v.val[3];
    vst4q_s16(output + NUM_OF_A2W1_THRESHOLD * i, res);
  }

  // Scalar tail for the remaining (< 8) channels. It must apply the same
  // adjustment as the vector body above, i.e. +1 for negative flags; the
  // previous tail decremented instead and disagreed with the vector path.
  for (; i < channels; ++i) {
    BIN_CONV_OUTPUT v0 = input[NUM_OF_A2W1_THRESHOLD * i + 0];
    BIN_CONV_OUTPUT v1 = input[NUM_OF_A2W1_THRESHOLD * i + 1];
    BIN_CONV_OUTPUT v2 = input[NUM_OF_A2W1_THRESHOLD * i + 2];
    const BIN_CONV_OUTPUT flg = input[NUM_OF_A2W1_THRESHOLD * i + 3];
    if (flg < 0) {
      ++v0;
      ++v1;
      ++v2;
    }
    output[NUM_OF_A2W1_THRESHOLD * i + 0] = v0;
    output[NUM_OF_A2W1_THRESHOLD * i + 1] = v1;
    output[NUM_OF_A2W1_THRESHOLD * i + 2] = v2;
    output[NUM_OF_A2W1_THRESHOLD * i + 3] = flg;
  }
}

void QuantizedConv2DTiling(const tiling_input_t& input,
const kernel_t& kernel,
const binary_convolution_parameters &p) {
Expand All @@ -95,18 +120,6 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
assert((in_channels % InTypeBitWidth) == 0);

Measurement::Start("Quantized Conv2D Tiling");
if (p.thresholds != nullptr) {
for (T_UINT i = 0; i < out_channels; i += 8) {
const auto v = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * i);
const auto is_neg = vreinterpretq_s16_u16(vmvnq_u16(vcgeq_s16(v.val[3], vdupq_n_s16(0))));
int16x8x4_t res;
res.val[0] = vsubq_s16(v.val[0], is_neg);
res.val[1] = vsubq_s16(v.val[1], is_neg);
res.val[2] = vsubq_s16(v.val[2], is_neg);
res.val[3] = v.val[3];
vst4q_s16(buf_th.get() + NUM_OF_A2W1_THRESHOLD * i, res);
}
}
constexpr uint8_t coeff_ary[16] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
Expand Down Expand Up @@ -264,7 +277,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
}
if (p.thresholds != nullptr) {
#define LOAD_TH(k) \
const auto ts##k = vld4q_s16(buf_th.get() + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
const auto ts##k = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
const auto is_neg##k = vreinterpretq_s16_u16(vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \
const auto m2_##k = vsubq_s16(ts##k.val[3], vdupq_n_s16(2)); \
const auto is_const##k = vcgeq_s16(m2_##k, vdupq_n_s16(0));
Expand Down Expand Up @@ -529,7 +542,7 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
}
if (p.thresholds != nullptr) {
#define LOAD_TH(k) \
const auto ts##k = vld4q_s16(buf_th.get() + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
const auto ts##k = vld4q_s16(p.thresholds + NUM_OF_A2W1_THRESHOLD * (out_ch_high * OutChUnroll2 + Om + 8 * k)); \
const auto is_neg##k = vreinterpretq_s16_u16(vcltq_s16(ts##k.val[3], vdupq_n_s16(0))); \
const auto m2_##k = vsubq_s16(ts##k.val[3], vdupq_n_s16(2)); \
const auto is_const##k = vcgeq_s16(m2_##k, vdupq_n_s16(0));
Expand Down
Loading