Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Experimental support for 5x5 convolution on CPU #683

Merged
merged 17 commits into from
Mar 5, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions dlk/python/dlk/templates/include/de10_nano.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,14 @@ Parameters calcParameters(uint32_t inputHeight, uint32_t inputWidth, uint32_t in
constexpr uint32_t maxBurst = 32;
constexpr uint32_t b = 32;

assert((kernelHeight == 3 && kernelWidth == 3) || (kernelHeight == 1 && kernelWidth == 1));

uint32_t pad = (kernelHeight == 1) ? 0 : 1;
constexpr std::size_t n_bit = 2;
constexpr std::size_t maxA = (1 << n_bit) - 1;
assert(kernelHeight == kernelWidth); // kernel rectangle must be square
assert(kernelHeight % 2 == 1); // kernel size must be odd
assert(1 <= kernelHeight && kernelHeight <= 3); // Currently, only 1x1, 3x3 conv are supported
assert(inputChannels * kernelHeight * kernelWidth * maxA <= std::numeric_limits<BIN_CONV_OUTPUT>::max()); // overflow check

uint32_t pad = kernelHeight / 2;
uint32_t dep = kernelHeight - 1;

auto outputHeight = inputHeight + 2 * pad - dep;
Expand Down Expand Up @@ -235,15 +240,8 @@ Parameters calcParameters(uint32_t inputHeight, uint32_t inputWidth, uint32_t in
p.a2fKernelVCount = kernelHeight;
p.a2fKernelHCount = kernelWidth;

if (kernelHeight == 1) {
p.a2fTileStep = 1u;
p.a2fTileGap = 1u;
}
else {
// TODO: 3x3 stride one assumed here
p.a2fTileStep = 1u;
p.a2fTileGap = 3u;
}
p.a2fTileStep = 1u; // stride one assumed
p.a2fTileGap = kernelHeight; // stride one assumed

p.a2fOutputHCount = hCount;
p.a2fOutputWCount = wCount;
Expand Down
70 changes: 35 additions & 35 deletions dlk/python/dlk/templates/include/func/quantized_conv2d.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,49 +48,49 @@ void QuantizedConv2D(
T_UINT iw = p.normal_conv_params.input_width;
T_UINT ic = p.normal_conv_params.kernel_depth;
T_UINT oc = p.normal_conv_params.output_channels;
T_UINT maxa = (1 << p.n_bit) - 1;
auto size = oc * ih * iw;
if (p.device_output_buf == nullptr)
p.device_output_buf = new BIN_CONV_OUTPUT[size]();

if ((kh == 3 && kw == 3 && padding == 1) ||
(kh == 1 && kw == 1 && padding == 0)) {
assert(kh == kw); // kernel rectangle must be square
assert(kh % 2 == 1); // kernel size must be odd
assert(1 <= kh && kh <= 5); // Only 1x1, 3x3, 5x5 are supported
assert(ic * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max()); // overflow check
#ifdef RUN_ON_FPGA
dlk::impl::tca_input_t::tensor_info_t<std::size_t> shape = {
(ic + QUANTIZED_PACKED::BitCount - 1) / QUANTIZED_PACKED::BitCount,
ih,
iw,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::tca_input_t tmp((QUANTIZED_PACKED*)p.device_input_buf, shape);
convert_tensor(input, tmp);
dlk::impl::TCAConv2d(tmp, kernel, p);
dlk::impl::tca_input_t::tensor_info_t<std::size_t> shape = {
(ic + QUANTIZED_PACKED::BitCount - 1) / QUANTIZED_PACKED::BitCount,
ih,
iw,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::tca_input_t tmp((QUANTIZED_PACKED*)p.device_input_buf, shape);
convert_tensor(input, tmp);
dlk::impl::TCAConv2d(tmp, kernel, p);
#elif defined USE_NEON || defined USE_AVX
dlk::impl::tiling_input_t::tensor_info_t<std::size_t> shape = {
ic / TilingInTypeBitWidth,
ih,
iw,
p.bin_input_bitwidth,
TilingInTypeBitWidth
};
dlk::impl::tiling_input_t tmp(reinterpret_cast<dlk::impl::tiling_input_elem_t*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DTiling(tmp, kernel, p);
dlk::impl::tiling_input_t::tensor_info_t<std::size_t> shape = {
ic / TilingInTypeBitWidth,
ih,
iw,
p.bin_input_bitwidth,
TilingInTypeBitWidth
};
dlk::impl::tiling_input_t tmp(reinterpret_cast<dlk::impl::tiling_input_elem_t*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DTiling(tmp, kernel, p);
#else
dlk::impl::kn2row_input_t::tensor_info_t<std::size_t> shape = {
ih,
iw,
ic / QUANTIZED_PACKED::BitCount,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::kn2row_input_t tmp(reinterpret_cast<QUANTIZED_PACKED*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DKn2Row(tmp, kernel, p);
dlk::impl::kn2row_input_t::tensor_info_t<std::size_t> shape = {
ih,
iw,
ic / QUANTIZED_PACKED::BitCount,
p.bin_input_bitwidth,
QUANTIZED_PACKED::BitCount
};
dlk::impl::kn2row_input_t tmp(reinterpret_cast<QUANTIZED_PACKED*>(p.device_input_buf), shape);
convert_tensor(input, tmp);
dlk::impl::QuantizedConv2DKn2Row(tmp, kernel, p);
#endif
} else {
throw std::invalid_argument("Unsupported convolution parameter");
}

Measurement::Stop();
}
Expand Down
97 changes: 19 additions & 78 deletions dlk/python/dlk/templates/include/matrix/shift_add.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,96 +22,37 @@ limitations under the License.

namespace dlk {

// True when the flattened pixel index `j` falls on the left-most (first)
// column of an image row of width `w`.
inline bool is_first_column(int j, int w) {
  const int col = j % w;
  return col == 0;
}

// True when the flattened pixel index `j` falls on the right-most (last)
// column of an image row of width `w`.
inline bool is_last_column(int j, int w) {
  const int col = j % w;
  return col == w - 1;
}

// 3x3 matrix
/* A B C */
/* D E F */
/* G H I */

// Is row `i` of the kn2row buffer produced by the right-most kernel column
// (cells C, F, I of the 3x3 kernel)? Each kernel cell spans `oc` consecutive
// rows, so i/oc identifies the kernel cell in row-major order (0..8).
inline bool is_cfi(int i, int oc) {
  const int cell = i / oc;
  return cell == 2 || cell == 5 || cell == 8;
}

// Is row `i` of the kn2row buffer produced by the left-most kernel column
// (cells A, D, G of the 3x3 kernel)? Each kernel cell spans `oc` consecutive
// rows, so i/oc identifies the kernel cell in row-major order (0..8).
inline bool is_adg(int i, int oc) {
  const int cell = i / oc;
  return cell == 0 || cell == 3 || cell == 6;
}

// Note: this function is only for 3x3 kernel
// Maps the row-major 3x3 kernel cell index `i` (0..8) to the offset, in
// flattened image coordinates, from the kernel's center cell. `w` is the
// image width.
//
//   cell index:  0 1 2      offset:  w+1   w   w-1
//                3 4 5                 1   0    -1
//                6 7 8               -w+1  -w  -w-1
//
// The original switch-based version fell off the end of the function for
// i > 8 when NDEBUG disabled the assert, which is undefined behavior for a
// value-returning function; the closed form below always returns.
inline int calc_offset(int i, int w) {
  assert(0 <= i && i <= 8);  // only valid for a 3x3 kernel
  const int dr = 1 - i / 3;  // vertical distance from the center row
  const int dc = 1 - i % 3;  // horizontal distance from the center column
  return dr * w + dc;
}

template<typename T>
void matrix_shift_add(MatrixView<T, MatrixOrder::ColMajor>& buf,
MatrixView<T, MatrixOrder::ColMajor>& result,
const struct convolution_parameters& p,
const int block_offset) {
Measurement::Start("matrix_shift_add1");

const int h = p.input_height;
const int w = p.input_width;
const int oc = p.output_channels;
const int kh = p.kernel_height;
const int kw = p.kernel_width;
const auto col_block = buf.cols();

// only 3x3 kernel is supported.
assert(kh == 3 && kw == 3);

for (unsigned int j = 0; j < col_block; ++j) {
for (unsigned int i = 0; i < buf.rows(); ++i) {
if (is_first_column(j + block_offset, w) && is_cfi(i, p.output_channels)) {
buf.set(i, j, 0);
} else if (is_last_column(j + block_offset, w) && is_adg(i, p.output_channels)) {
buf.set(i, j, 0);
}
}
}
Measurement::Start("matrix_shift_add");

Measurement::Stop();
const std::ptrdiff_t h = p.input_height;
const std::ptrdiff_t w = p.input_width;
const std::ptrdiff_t oc = p.output_channels;
const std::ptrdiff_t kh = p.kernel_height;
const std::ptrdiff_t kw = p.kernel_width;
const std::ptrdiff_t col_block = buf.cols();
const std::ptrdiff_t pad = p.padding;

Measurement::Start("matrix_shift_add2");
// only 3x3 or 5x5 kernel is supported.
assert(kh == kw);
assert(kh % 2 == 1);
assert(3 <= kh && kh <= 5);

for (int k = 0; k < col_block; ++k) {
const auto true_k = k + block_offset;
const auto row = true_k / w;
const auto col = true_k % w;
for (unsigned int i = 0; i < kh * kw; ++i) {
int offset = calc_offset(i, w);
if ((true_k + offset < 0) || (true_k + offset >= h * w)) {
continue;
}
int kr = i / kw;
int kc = i % kw;
if (row - kr + pad < 0 || row - kr + pad >= h || col - kc + pad < 0 || col - kc + pad >= w) continue;

T* r = result.data(0, true_k + offset);
int offset = (kr - pad) * w + (kc - pad);
T* r = result.data(0, true_k - offset);
T* b = buf.data(i*oc, k);

for (unsigned int j = 0; j < oc; ++j) {
Expand Down
9 changes: 5 additions & 4 deletions dlk/python/dlk/templates/src/func/conv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ limitations under the License.
namespace {

template<typename T, typename U>
void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
void conv_nxn_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,
const TensorView<T, MemoryLayout::HWOI>& kernels,
const TensorView<U, MemoryLayout::NHWC>& output,
struct convolution_parameters& p) {
Expand All @@ -44,7 +44,8 @@ void conv3x3_kn2row(const TensorView<T, MemoryLayout::NHWC>& input,

// assertions
assert(ih * iw == oh * ow);
assert(kh == 3 && kw == 3);
assert(kh == kw);
assert(3 <= kh && kh <= 5);

// need to initialize output
std::memset(output.data(), 0, oc * ih * iw * sizeof(U));
Expand Down Expand Up @@ -184,7 +185,7 @@ void convolution(
int kernels_size = p.kernel_height * p.kernel_width * p.kernel_depth * p.output_channels;
conv1x1_kn2row(input, kernels, output, p);
return;
} else if (p.kernel_height == 3 && p.kernel_width == 3 && p.padding == 1) {
} else if (p.kernel_height == p.kernel_width && 3 <= p.kernel_height && p.kernel_height <= 5 && p.padding == p.kernel_height / 2) {
int kernels_size = p.kernel_height * p.kernel_width * p.kernel_depth * p.output_channels;
T* buf = reinterpret_cast<T*>(p.temporary_buf);
using hwoi_t = TensorView<T, MemoryLayout::HWOI>;
Expand All @@ -196,7 +197,7 @@ void convolution(
};
hwoi_t kernels_hwoi(buf, hwoi_shape);
ohwi_to_hwoi(kernels, kernels_hwoi, p);
conv3x3_kn2row(input, kernels_hwoi, output, p);
conv_nxn_kn2row(input, kernels_hwoi, output, p);
return;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ limitations under the License.

#include <cassert>
#include <climits>
#include <limits>

#include "global.h"
#include "func/impl/quantized_conv2d_tiling.h"
Expand Down Expand Up @@ -114,8 +115,10 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
const T_UINT out_height = cp.output_height;
const T_UINT out_width = cp.output_width;
const T_UINT out_size = out_height * out_width * out_channels;
const T_UINT maxa = (1 << in_bitwidth) - 1;

assert(kh * kw < 32);
assert(in_channels * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max());
assert(in_height * in_width == out_height * out_width);
assert((in_channels % InTypeBitWidth) == 0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License.
#include <cassert>
#include <cstring>
#include <algorithm>
#include <limits>

#include "global.h"
#include "func/impl/quantized_conv2d_kn2row.h"
Expand Down Expand Up @@ -48,6 +49,7 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,
T_UINT ow = p.normal_conv_params.output_width;
T_UINT kh = p.normal_conv_params.kernel_height;
T_UINT kw = p.normal_conv_params.kernel_width;
T_UINT maxa = (1 << p.n_bit) - 1;
BYTE *temp_buf_ptr = p.normal_conv_params.temporary_buf;

assert(ih * iw == oh * ow);
Expand All @@ -59,7 +61,13 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,
out_buf, oc, ih * iw);
auto kernel_ = MatrixView<QUANTIZED_PACKED_KERNEL, MatrixOrder::RowMajor>(
kernel.data(), oc * kh * kw, ic / 32);
if (kh == kw && kw == 3) {

assert(kh == kw);
assert(kh % 2 == 1);
assert(1 <= kh && kh <= 5);
assert(ic * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max());

if (kh >= 3) {
std::fill(out_buf, out_buf + oc * oh * ow, 0);
for (std::size_t offset = 0; offset < ih * iw; offset += MAX_SIZE_KN2ROW_COL_BLOCK) {
const auto col_block = std::min(static_cast<std::size_t>(MAX_SIZE_KN2ROW_COL_BLOCK), ih * iw - offset);
Expand All @@ -71,15 +79,12 @@ void QuantizedConv2DKn2Row(const kn2row_input_t& input,
quantized_matrix_multiplication(kernel_, input_, buf_);
matrix_shift_add(buf_, output_, p.normal_conv_params, offset);
}
} else if (kh == kw && kw == 1) {
} else {
auto input_ = MatrixView<QUANTIZED_PACKED, MatrixOrder::ColMajor>(
input.data(), ic / 16, ih * iw);
auto output_ = MatrixView<BIN_CONV_OUTPUT, MatrixOrder::ColMajor>(
out_buf, oc, ih * iw);
quantized_matrix_multiplication(kernel_, input_, output_);
} else {
std::cerr << "Only 1x1 or 3x3 convolutions are supported." << std::endl;
assert(false);
}

const auto out_size = oc * oh * ow;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ limitations under the License.

#include <cassert>
#include <climits>
#include <limits>

#include "global.h"
#include "func/impl/quantized_conv2d_tiling.h"
Expand Down Expand Up @@ -143,8 +144,10 @@ void QuantizedConv2DTiling(const tiling_input_t& input,
const std::size_t out_height = cp.output_height;
const std::size_t out_width = cp.output_width;
const std::size_t out_size = out_height * out_width * out_channels;
const std::size_t maxa = (1 << in_bitwidth) - 1;

//assert(kh * kw < 32);
assert(kh * kw < 32);
assert(in_channels * kh * kw * maxa <= std::numeric_limits<BIN_CONV_OUTPUT>::max());
assert(in_height * in_width == out_height * out_width);
assert((in_channels % InTypeBitWidth) == 0);

Expand Down
Loading