From 3c030daa6fc3622df0a2104027544cbc984df7c6 Mon Sep 17 00:00:00 2001 From: primenumber Date: Wed, 15 Jan 2020 16:26:26 +0900 Subject: [PATCH 1/2] Use ChHWBCl instead of HWChBCl --- dlk/python/dlk/core/optimizer.py | 6 +- .../templates/include/pack_input_to_qwords.h | 2 +- dlk/python/dlk/templates/include/quantizer.h | 2 +- .../templates/src/pack_input_to_qwords.cpp | 126 +++++++++--------- dlk/python/dlk/templates/src/quantizer.cpp | 2 +- 5 files changed, 66 insertions(+), 72 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 9c1046d3c..ab2332627 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -500,7 +500,7 @@ def pass_quantize_convolutions(graph: Graph) -> None: width = qtz.width depth = qtz.channel depth_upper = (depth + b - 1) // b - qtz.update_shape([height, width, depth_upper, 2, b], "HWChBCl") + qtz.update_shape([depth_upper, height, width, 2, b], "ChHWBCl") def pass_propagate_datatypes(graph) -> None: @@ -530,10 +530,6 @@ def pass_propagate_format(graph) -> None: b = 32 shape = [(m.channel + b - 1) // b, m.height, m.width, 2, b] m.update_shape(shape, m.input_nodes[0].dimension) - elif m.input_nodes[0].dimension == 'HWChBCl': - b = 32 - shape = [m.height, m.width, (m.channel + b - 1) // b, 2, b] - m.update_shape(shape, m.input_nodes[0].dimension) def pass_propagate_output_type_backward(graph: Graph) -> None: diff --git a/dlk/python/dlk/templates/include/pack_input_to_qwords.h b/dlk/python/dlk/templates/include/pack_input_to_qwords.h index daed9236d..5ed8f0ed1 100644 --- a/dlk/python/dlk/templates/include/pack_input_to_qwords.h +++ b/dlk/python/dlk/templates/include/pack_input_to_qwords.h @@ -30,7 +30,7 @@ void pack_input_to_qwords( struct binary_convolution_parameters bcp); -int pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth, +void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth, size_t bits_per_input, QUANTIZED_PACKED output[]); #endif // DLK_PACK_INPUT_TO_QWORDS_H_INCLUDED diff --git a/dlk/python/dlk/templates/include/quantizer.h b/dlk/python/dlk/templates/include/quantizer.h index d3846e3f8..6e98ff828 100644 --- a/dlk/python/dlk/templates/include/quantizer.h +++ b/dlk/python/dlk/templates/include/quantizer.h @@ -46,7 +46,7 @@ void func_QTZ_linear_mid_tread_half( const TensorView& input, const TensorView& nbit, const TensorView& max_value, - const TensorView& output, + const TensorView& output, BYTE *temporary_buf); void func_QTZ_linear_mid_tread_half( diff --git a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp index ae8610488..5342058c8 100644 --- a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp +++ b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp @@ -24,99 +24,97 @@ limitations under the License. #include #endif -int pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth, +void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth, size_t bits_per_input, QUANTIZED_PACKED output[]) { Measurement::Start("pack_input"); - const int bits_per_word = sizeof(QUANTIZED_PACKED) * CHAR_BIT; - int full_words_in_depth = input_depth / bits_per_word; - int remainder_bits_in_depth = input_depth % bits_per_word; - int input_index = 0; - int current_word = 0; + constexpr size_t bits_per_word = sizeof(QUANTIZED_PACKED) * CHAR_BIT; + const size_t full_words_in_depth = input_depth / bits_per_word; + const size_t blocks_in_depth = (input_depth + bits_per_word - 1) / bits_per_word; + const size_t remainder_bits_in_depth = input_depth % bits_per_word; - auto len = input_height * input_width * input_depth; + const auto area = input_height * input_width; #ifdef USE_NEON - if (input_depth % 32 == 0) { + if (bits_per_input == 2 && input_depth % bits_per_word == 0) { const uint8_t coeff_ary[16] = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, }; const auto coeff = vld1q_u8(coeff_ary); const auto vone = vdupq_n_u8(1); - constexpr int b = 32; - constexpr int n_bits = 2; - const auto blocks = len / b; #pragma omp parallel for - for (int i = 0; i < blocks; ++i) { - const auto v0 = vld1q_u8(input + i * b + 0); - const auto v1 = vld1q_u8(input + i * b + 16); - const auto l0 = vandq_u8(v0, vone); - const auto l1 = vandq_u8(v1, vone); - const auto m0 = vshrq_n_u8(v0, 1); - const auto m1 = vshrq_n_u8(v1, 1); - const auto ml0 = vmulq_u8(l0, coeff); - const auto ml1 = vmulq_u8(l1, coeff); - const auto mm0 = vmulq_u8(m0, coeff); - const auto mm1 = vmulq_u8(m1, coeff); - const auto al0 = vpadd_u8(vget_low_u8(ml0), vget_high_u8(ml0)); - const auto al1 = vpadd_u8(vget_low_u8(ml1), vget_high_u8(ml1)); - const auto am0 = vpadd_u8(vget_low_u8(mm0), vget_high_u8(mm0)); - const auto am1 = vpadd_u8(vget_low_u8(mm1), vget_high_u8(mm1)); - const auto bl = vpadd_u8(al0, al1); - const auto bm = vpadd_u8(am0, am1); - const auto c = vpadd_u8(bl, bm); - vst1_u8(reinterpret_cast(output + i * n_bits), c); + for (size_t i = 0; i < area; ++i) { + for (size_t j = 0; j < full_words_in_depth; ++j) { + const auto v0 = vld1q_u8(input + i*blocks_in_depth*bits_per_word + j*bits_per_word + 0); + const auto v1 = vld1q_u8(input + i*blocks_in_depth*bits_per_word + j*bits_per_word + 16); + const auto l0 = vandq_u8(v0, vone); + const auto l1 = vandq_u8(v1, vone); + const auto m0 = vshrq_n_u8(v0, 1); + const auto m1 = vshrq_n_u8(v1, 1); + const auto ml0 = vmulq_u8(l0, coeff); + const auto ml1 = vmulq_u8(l1, coeff); + const auto mm0 = vmulq_u8(m0, coeff); + const auto mm1 = vmulq_u8(m1, coeff); + const auto al0 = vpadd_u8(vget_low_u8(ml0), vget_high_u8(ml0)); + const auto al1 = vpadd_u8(vget_low_u8(ml1), vget_high_u8(ml1)); + const auto am0 = vpadd_u8(vget_low_u8(mm0), vget_high_u8(mm0)); + const auto am1 = vpadd_u8(vget_low_u8(mm1), vget_high_u8(mm1)); + const auto bl = vpadd_u8(al0, al1); + const auto bm = vpadd_u8(am0, am1); + const auto c = vpadd_u8(bl, bm); + vst1_u8(reinterpret_cast(output + i*bits_per_input + j*area*bits_per_input), c); + } } Measurement::Stop(); - return 0; + return; } #endif #ifdef USE_AVX - constexpr std::size_t SIMD_WIDTH = 32; - if ((input_depth % SIMD_WIDTH) == 0) { - const auto blocks = len / SIMD_WIDTH; - for (int i = 0; i < blocks; ++i) { - const auto a = _mm256_loadu_si256(reinterpret_cast<__m256i*>(input + i * SIMD_WIDTH)); - const auto l = _mm256_movemask_epi8(_mm256_slli_epi16(a, 7)); - const auto m = _mm256_movemask_epi8(_mm256_slli_epi16(a, 6)); - output[i*2 + 0] = QUANTIZED_PACKED(l); - output[i*2 + 1] = QUANTIZED_PACKED(m); + if (bits_per_input == 2 && input_depth % bits_per_word == 0) { +#pragma omp parallel for + for (size_t i = 0; i < area; ++i) { + for (size_t j = 0; j < full_words_in_depth; ++j) { + const auto a = _mm256_loadu_si256(reinterpret_cast<__m256i*>(input + i*blocks_in_depth*bits_per_word + j*bits_per_word)); + const auto l = _mm256_movemask_epi8(_mm256_slli_epi16(a, 7)); + const auto m = _mm256_movemask_epi8(_mm256_slli_epi16(a, 6)); + output[i*bits_per_input + j*area*bits_per_input + 0] = QUANTIZED_PACKED(l); + output[i*bits_per_input + j*area*bits_per_input + 1] = QUANTIZED_PACKED(m); + } } Measurement::Stop(); - return 0; + return; } #endif - for (int h = 0; h < input_height; ++h) - for (int w = 0; w < input_width; ++w) { - for (int d = 0; d < full_words_in_depth; ++d) { - output[current_word] = QUANTIZED_PACKED(0); - output[current_word + 1] = QUANTIZED_PACKED(0); - for (int d = 0; d < bits_per_word; ++d) { - for(int b = 0; b < bits_per_input; ++b) - output[current_word + b] = QUANTIZED_PACKED(output[current_word + b].Raw() | ((input[input_index] >> b) & 1) << d); - input_index++; - } - current_word += bits_per_input; - } + for (size_t i = 0; i < area; ++i) { + for (size_t j = 0; j < full_words_in_depth; ++j) { + for(size_t b = 0; b < bits_per_input; ++b) { + QUANTIZED_PACKED tmp(0); + for (size_t d = 0; d < bits_per_word; ++d) { + QUANTIZED_PACKED::base_t in = input[i*input_depth + j*bits_per_word + d]; + tmp |= QUANTIZED_PACKED(((in >> b) & 1) << d); + } + output[i*bits_per_input + j*area*bits_per_input + b] = tmp; + } + } - if(!remainder_bits_in_depth) - continue; + if(!remainder_bits_in_depth) + continue; - output[current_word] = QUANTIZED_PACKED(0); - output[current_word + 1] = QUANTIZED_PACKED(0); - for (int d = 0; d < remainder_bits_in_depth; ++d) { - for(int b = 0; b < bits_per_input; ++b) - output[current_word + b] = QUANTIZED_PACKED(output[current_word + b].Raw() | ((input[input_index] >> b) & 1) << d); - input_index++; - } - current_word += bits_per_input; + for(size_t b = 0; b < bits_per_input; ++b) { + QUANTIZED_PACKED tmp(0); + for (size_t d = 0; d < remainder_bits_in_depth; ++d) { + QUANTIZED_PACKED::base_t in = input[i*input_depth + full_words_in_depth*bits_per_word + d]; + tmp |= QUANTIZED_PACKED(((in >> b) & 1) << d); } + output[i*bits_per_input + full_words_in_depth*area*bits_per_input + b] = tmp; + } + } Measurement::Stop(); - return current_word; + return; } void pack_input_to_qwords( diff --git a/dlk/python/dlk/templates/src/quantizer.cpp b/dlk/python/dlk/templates/src/quantizer.cpp index ee8feb751..a33d4dcd8 100644 --- a/dlk/python/dlk/templates/src/quantizer.cpp +++ b/dlk/python/dlk/templates/src/quantizer.cpp @@ -120,7 +120,7 @@ void func_QTZ_linear_mid_tread_half( const TensorView& input, const TensorView& nbit, const TensorView& max_value, - const TensorView& output, + const TensorView& output, BYTE *temporary_buf) { Measurement::Start("QTZ_linear_mid_tread_half"); From 43b67edc607288156bef4cbb3bd360a3345352ca Mon Sep 17 00:00:00 2001 From: primenumber Date: Tue, 28 Jan 2020 12:20:50 +0900 Subject: [PATCH 2/2] Just formatting --- dlk/python/dlk/templates/src/pack_input_to_qwords.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp index 5342058c8..2727478b7 100644 --- a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp +++ b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp @@ -89,7 +89,7 @@ void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_ for (size_t i = 0; i < area; ++i) { for (size_t j = 0; j < full_words_in_depth; ++j) { - for(size_t b = 0; b < bits_per_input; ++b) { + for (size_t b = 0; b < bits_per_input; ++b) { QUANTIZED_PACKED tmp(0); for (size_t d = 0; d < bits_per_word; ++d) { QUANTIZED_PACKED::base_t in = input[i*input_depth + j*bits_per_word + d]; @@ -99,10 +99,10 @@ void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_ } } - if(!remainder_bits_in_depth) + if (!remainder_bits_in_depth) continue; - for(size_t b = 0; b < bits_per_input; ++b) { + for (size_t b = 0; b < bits_per_input; ++b) { QUANTIZED_PACKED tmp(0); for (size_t d = 0; d < remainder_bits_in_depth; ++d) { QUANTIZED_PACKED::base_t in = input[i*input_depth + full_words_in_depth*bits_per_word + d];