This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Use ChHWBCl instead of HWChBCl where possible #729

Merged (3 commits) on Jan 28, 2020
dlk/python/dlk/core/optimizer.py (6 changes: 1 addition & 5 deletions)

```diff
--- a/dlk/python/dlk/core/optimizer.py
+++ b/dlk/python/dlk/core/optimizer.py
@@ -500,7 +500,7 @@ def pass_quantize_convolutions(graph: Graph) -> None:
         width = qtz.width
         depth = qtz.channel
         depth_upper = (depth + b - 1) // b
-        qtz.update_shape([height, width, depth_upper, 2, b], "HWChBCl")
+        qtz.update_shape([depth_upper, height, width, 2, b], "ChHWBCl")
 
 
 def pass_propagate_datatypes(graph) -> None:
@@ -530,10 +530,6 @@ def pass_propagate_format(graph) -> None:
                 b = 32
                 shape = [(m.channel + b - 1) // b, m.height, m.width, 2, b]
                 m.update_shape(shape, m.input_nodes[0].dimension)
-            elif m.input_nodes[0].dimension == 'HWChBCl':
-                b = 32
-                shape = [m.height, m.width, (m.channel + b - 1) // b, 2, b]
-                m.update_shape(shape, m.input_nodes[0].dimension)
 
 
 def pass_propagate_output_type_backward(graph: Graph) -> None:
```
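The format strings name the axes of the packed tensor: `HWChBCl` is height, width, channel block (`Ch`, `ceil(C / 32)` words), bit plane (`B`, 2 planes), and channel lane within a block (`Cl`, 32 lanes); `ChHWBCl` moves the channel-block axis to the front, as the new shape `[depth_upper, height, width, 2, b]` shows. A hedged sketch of the flat index arithmetic under a row-major reading (the helper names are illustrative, not repository code):

```cpp
#include <cstddef>

// Flat element index of channel block `ch`, pixel (`y`, `x`), bit plane `bp`,
// lane `cl`, assuming 2 bit planes and 32 lanes per block as in the pass above.
std::size_t idx_hwchbcl(std::size_t y, std::size_t x, std::size_t ch,
                        std::size_t bp, std::size_t cl,
                        std::size_t w, std::size_t ch_blocks) {
  // HWChBCl (old): the channel blocks of one pixel are adjacent.
  return (((y * w + x) * ch_blocks + ch) * 2 + bp) * 32 + cl;
}

std::size_t idx_chhwbcl(std::size_t y, std::size_t x, std::size_t ch,
                        std::size_t bp, std::size_t cl,
                        std::size_t h, std::size_t w) {
  // ChHWBCl (new): the channel-block axis is outermost.
  return (((ch * h + y) * w + x) * 2 + bp) * 32 + cl;
}
```

Keeping the channel-block axis outermost makes all pixels of one 32-channel block contiguous, which is the order the rewritten pack_input below writes.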
dlk/python/dlk/templates/include/pack_input_to_qwords.h (2 changes: 1 addition & 1 deletion)

```diff
--- a/dlk/python/dlk/templates/include/pack_input_to_qwords.h
+++ b/dlk/python/dlk/templates/include/pack_input_to_qwords.h
@@ -30,7 +30,7 @@ void pack_input_to_qwords(
     struct binary_convolution_parameters bcp);
 
 
-int pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
+void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
                size_t bits_per_input, QUANTIZED_PACKED output[]);
 
 #endif // DLK_PACK_INPUT_TO_QWORDS_H_INCLUDED
```
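The return type changes from `int` to `void`: the old implementation returned the running `current_word` counter from its scalar path but plain `0` from the SIMD paths, so the value was never reliable, and the rewrite computes every output position directly instead of tracking a cursor. The output size is still easy to state; a minimal sketch, assuming 32-bit `QUANTIZED_PACKED` words (the helper is hypothetical, derived from the new indexing):

```cpp
#include <climits>
#include <cstddef>
#include <cstdint>

// Number of packed words pack_input writes for a (height, width, depth)
// input quantized to bits_per_input bit planes. Hypothetical helper.
std::size_t packed_output_words(std::size_t height, std::size_t width,
                                std::size_t depth, std::size_t bits_per_input) {
  const std::size_t bits_per_word = sizeof(std::uint32_t) * CHAR_BIT;  // 32
  const std::size_t blocks_in_depth = (depth + bits_per_word - 1) / bits_per_word;
  return blocks_in_depth * height * width * bits_per_input;
}
```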
dlk/python/dlk/templates/include/quantizer.h (2 changes: 1 addition & 1 deletion)

```diff
--- a/dlk/python/dlk/templates/include/quantizer.h
+++ b/dlk/python/dlk/templates/include/quantizer.h
@@ -46,7 +46,7 @@ void func_QTZ_linear_mid_tread_half(
     const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
     const TensorView<T_INT, MemoryLayout::Atom>& nbit,
     const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
-    const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output,
+    const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output,
     BYTE *temporary_buf);
 
 void func_QTZ_linear_mid_tread_half(
```
dlk/python/dlk/templates/src/pack_input_to_qwords.cpp (126 changes: 62 additions & 64 deletions)

```diff
--- a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
+++ b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
@@ -24,99 +24,97 @@ limitations under the License.
 #include <x86intrin.h>
 #endif
 
-int pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
+void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
                size_t bits_per_input, QUANTIZED_PACKED output[]) {
 
   Measurement::Start("pack_input");
-  const int bits_per_word = sizeof(QUANTIZED_PACKED) * CHAR_BIT;
-  int full_words_in_depth = input_depth / bits_per_word;
-  int remainder_bits_in_depth = input_depth % bits_per_word;
-  int input_index = 0;
-  int current_word = 0;
+  constexpr size_t bits_per_word = sizeof(QUANTIZED_PACKED) * CHAR_BIT;
+  const size_t full_words_in_depth = input_depth / bits_per_word;
+  const size_t blocks_in_depth = (input_depth + bits_per_word - 1) / bits_per_word;
+  const size_t remainder_bits_in_depth = input_depth % bits_per_word;
 
-  auto len = input_height * input_width * input_depth;
+  const auto area = input_height * input_width;
 #ifdef USE_NEON
-  if (input_depth % 32 == 0) {
+  if (bits_per_input == 2 && input_depth % bits_per_word == 0) {
     const uint8_t coeff_ary[16] = {
       1, 2, 4, 8, 16, 32, 64, 128,
       1, 2, 4, 8, 16, 32, 64, 128,
     };
     const auto coeff = vld1q_u8(coeff_ary);
     const auto vone = vdupq_n_u8(1);
-    constexpr int b = 32;
-    constexpr int n_bits = 2;
-    const auto blocks = len / b;
 #pragma omp parallel for
-    for (int i = 0; i < blocks; ++i) {
-      const auto v0 = vld1q_u8(input + i * b + 0);
-      const auto v1 = vld1q_u8(input + i * b + 16);
-      const auto l0 = vandq_u8(v0, vone);
-      const auto l1 = vandq_u8(v1, vone);
-      const auto m0 = vshrq_n_u8(v0, 1);
-      const auto m1 = vshrq_n_u8(v1, 1);
-      const auto ml0 = vmulq_u8(l0, coeff);
-      const auto ml1 = vmulq_u8(l1, coeff);
-      const auto mm0 = vmulq_u8(m0, coeff);
-      const auto mm1 = vmulq_u8(m1, coeff);
-      const auto al0 = vpadd_u8(vget_low_u8(ml0), vget_high_u8(ml0));
-      const auto al1 = vpadd_u8(vget_low_u8(ml1), vget_high_u8(ml1));
-      const auto am0 = vpadd_u8(vget_low_u8(mm0), vget_high_u8(mm0));
-      const auto am1 = vpadd_u8(vget_low_u8(mm1), vget_high_u8(mm1));
-      const auto bl = vpadd_u8(al0, al1);
-      const auto bm = vpadd_u8(am0, am1);
-      const auto c = vpadd_u8(bl, bm);
-      vst1_u8(reinterpret_cast<uint8_t*>(output + i * n_bits), c);
+    for (size_t i = 0; i < area; ++i) {
+      for (size_t j = 0; j < full_words_in_depth; ++j) {
+        const auto v0 = vld1q_u8(input + i*blocks_in_depth*bits_per_word + j*bits_per_word + 0);
+        const auto v1 = vld1q_u8(input + i*blocks_in_depth*bits_per_word + j*bits_per_word + 16);
+        const auto l0 = vandq_u8(v0, vone);
+        const auto l1 = vandq_u8(v1, vone);
+        const auto m0 = vshrq_n_u8(v0, 1);
+        const auto m1 = vshrq_n_u8(v1, 1);
+        const auto ml0 = vmulq_u8(l0, coeff);
+        const auto ml1 = vmulq_u8(l1, coeff);
+        const auto mm0 = vmulq_u8(m0, coeff);
+        const auto mm1 = vmulq_u8(m1, coeff);
+        const auto al0 = vpadd_u8(vget_low_u8(ml0), vget_high_u8(ml0));
+        const auto al1 = vpadd_u8(vget_low_u8(ml1), vget_high_u8(ml1));
+        const auto am0 = vpadd_u8(vget_low_u8(mm0), vget_high_u8(mm0));
+        const auto am1 = vpadd_u8(vget_low_u8(mm1), vget_high_u8(mm1));
+        const auto bl = vpadd_u8(al0, al1);
+        const auto bm = vpadd_u8(am0, am1);
+        const auto c = vpadd_u8(bl, bm);
+        vst1_u8(reinterpret_cast<uint8_t*>(output + i*bits_per_input + j*area*bits_per_input), c);
+      }
     }
     Measurement::Stop();
-    return 0;
+    return;
   }
 #endif
 
 #ifdef USE_AVX
-  constexpr std::size_t SIMD_WIDTH = 32;
-  if ((input_depth % SIMD_WIDTH) == 0) {
-    const auto blocks = len / SIMD_WIDTH;
-    for (int i = 0; i < blocks; ++i) {
-      const auto a = _mm256_loadu_si256(reinterpret_cast<__m256i*>(input + i * SIMD_WIDTH));
-      const auto l = _mm256_movemask_epi8(_mm256_slli_epi16(a, 7));
-      const auto m = _mm256_movemask_epi8(_mm256_slli_epi16(a, 6));
-      output[i*2 + 0] = QUANTIZED_PACKED(l);
-      output[i*2 + 1] = QUANTIZED_PACKED(m);
+  if (bits_per_input == 2 && input_depth % bits_per_word == 0) {
+#pragma omp parallel for
+    for (size_t i = 0; i < area; ++i) {
+      for (size_t j = 0; j < full_words_in_depth; ++j) {
+        const auto a = _mm256_loadu_si256(reinterpret_cast<__m256i*>(input + i*blocks_in_depth*bits_per_word + j*bits_per_word));
+        const auto l = _mm256_movemask_epi8(_mm256_slli_epi16(a, 7));
+        const auto m = _mm256_movemask_epi8(_mm256_slli_epi16(a, 6));
+        output[i*bits_per_input + j*area*bits_per_input + 0] = QUANTIZED_PACKED(l);
+        output[i*bits_per_input + j*area*bits_per_input + 1] = QUANTIZED_PACKED(m);
+      }
     }
     Measurement::Stop();
-    return 0;
+    return;
   }
 #endif
 
-  for (int h = 0; h < input_height; ++h)
-    for (int w = 0; w < input_width; ++w) {
-      for (int d = 0; d < full_words_in_depth; ++d) {
-        output[current_word] = QUANTIZED_PACKED(0);
-        output[current_word + 1] = QUANTIZED_PACKED(0);
-        for (int d = 0; d < bits_per_word; ++d) {
-          for(int b = 0; b < bits_per_input; ++b)
-            output[current_word + b] = QUANTIZED_PACKED(output[current_word + b].Raw() | ((input[input_index] >> b) & 1) << d);
-          input_index++;
-        }
-        current_word += bits_per_input;
-      }
-
-      if(!remainder_bits_in_depth)
-        continue;
-
-      output[current_word] = QUANTIZED_PACKED(0);
-      output[current_word + 1] = QUANTIZED_PACKED(0);
-      for (int d = 0; d < remainder_bits_in_depth; ++d) {
-        for(int b = 0; b < bits_per_input; ++b)
-          output[current_word + b] = QUANTIZED_PACKED(output[current_word + b].Raw() | ((input[input_index] >> b) & 1) << d);
-        input_index++;
-      }
-      current_word += bits_per_input;
-    }
+  for (size_t i = 0; i < area; ++i) {
+    for (size_t j = 0; j < full_words_in_depth; ++j) {
+      for (size_t b = 0; b < bits_per_input; ++b) {
+        QUANTIZED_PACKED tmp(0);
+        for (size_t d = 0; d < bits_per_word; ++d) {
+          QUANTIZED_PACKED::base_t in = input[i*input_depth + j*bits_per_word + d];
+          tmp |= QUANTIZED_PACKED(((in >> b) & 1) << d);
+        }
+        output[i*bits_per_input + j*area*bits_per_input + b] = tmp;
+      }
+    }
+
+    if (!remainder_bits_in_depth)
+      continue;
+
+    for (size_t b = 0; b < bits_per_input; ++b) {
+      QUANTIZED_PACKED tmp(0);
+      for (size_t d = 0; d < remainder_bits_in_depth; ++d) {
+        QUANTIZED_PACKED::base_t in = input[i*input_depth + full_words_in_depth*bits_per_word + d];
+        tmp |= QUANTIZED_PACKED(((in >> b) & 1) << d);
+      }
+      output[i*bits_per_input + full_words_in_depth*area*bits_per_input + b] = tmp;
+    }
+  }
 
   Measurement::Stop();
-
-  return current_word;
+  return;
 }
 
 void pack_input_to_qwords(
```
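Both SIMD paths and the scalar fallback now share one structure: the outer loop walks the `area = input_height * input_width` pixels, the inner loop walks the full depth blocks, and word `b` of block `j` at pixel `i` lands at `(j*area + i)*bits_per_input + b`, which is the ChHWBCl order with the block axis outermost. In the AVX path, `_mm256_slli_epi16(a, 7)` parks bit 0 of each of the 32 input bytes in that byte's sign bit, and `_mm256_movemask_epi8` gathers the 32 sign bits into one word (one bit plane of one depth block); the shift by 6 extracts bit 1 the same way. As a cross-check, here is a minimal scalar sketch of the same packing; `pack_reference` and the plain `uint32_t` word type are illustrative stand-ins, not repository code:

```cpp
#include <climits>
#include <cstddef>
#include <cstdint>

// Reference packer (hypothetical): packs an HWC tensor of `bits`-bit values
// into 32-bit words in ChHWBCl order -- depth block (Ch) outermost, then the
// flattened H*W pixel index, then the bit plane (B), then 32 lanes (Cl).
void pack_reference(const std::uint8_t* input, std::size_t height,
                    std::size_t width, std::size_t depth, std::size_t bits,
                    std::uint32_t* output) {
  const std::size_t bits_per_word = sizeof(std::uint32_t) * CHAR_BIT;  // 32
  const std::size_t blocks_in_depth = (depth + bits_per_word - 1) / bits_per_word;
  const std::size_t area = height * width;
  for (std::size_t i = 0; i < area; ++i) {               // pixel (H, W flattened)
    for (std::size_t j = 0; j < blocks_in_depth; ++j) {  // depth block, Ch
      // The last block may cover fewer than 32 channels.
      const std::size_t lanes =
          (j + 1) * bits_per_word <= depth ? bits_per_word : depth % bits_per_word;
      for (std::size_t b = 0; b < bits; ++b) {           // bit plane, B
        std::uint32_t word = 0;
        for (std::size_t d = 0; d < lanes; ++d)          // lane within block, Cl
          word |= ((std::uint32_t{input[i * depth + j * bits_per_word + d]} >> b) & 1u) << d;
        // ChHWBCl: the block index j is the slowest-varying axis.
        output[(j * area + i) * bits + b] = word;
      }
    }
  }
}
```

For a 64-channel input this writes the block-0 words for every pixel first, then all block-1 words, matching the `i*bits_per_input + j*area*bits_per_input` stores in the SIMD paths above.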
dlk/python/dlk/templates/src/quantizer.cpp (2 changes: 1 addition & 1 deletion)

```diff
--- a/dlk/python/dlk/templates/src/quantizer.cpp
+++ b/dlk/python/dlk/templates/src/quantizer.cpp
@@ -120,7 +120,7 @@ void func_QTZ_linear_mid_tread_half(
     const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
     const TensorView<T_INT, MemoryLayout::Atom>& nbit,
     const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
-    const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output,
+    const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output,
     BYTE *temporary_buf) {
   Measurement::Start("QTZ_linear_mid_tread_half");
```