From 3c030daa6fc3622df0a2104027544cbc984df7c6 Mon Sep 17 00:00:00 2001
From: primenumber <prime@kmc.gr.jp>
Date: Wed, 15 Jan 2020 16:26:26 +0900
Subject: [PATCH 1/2] Use ChHWBCl instead of HWChBCl

---
 dlk/python/dlk/core/optimizer.py              |   6 +-
 .../templates/include/pack_input_to_qwords.h  |   2 +-
 dlk/python/dlk/templates/include/quantizer.h  |   2 +-
 .../templates/src/pack_input_to_qwords.cpp    | 126 +++++++++---------
 dlk/python/dlk/templates/src/quantizer.cpp    |   2 +-
 5 files changed, 66 insertions(+), 72 deletions(-)

diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py
index 9c1046d3c..ab2332627 100644
--- a/dlk/python/dlk/core/optimizer.py
+++ b/dlk/python/dlk/core/optimizer.py
@@ -500,7 +500,7 @@ def pass_quantize_convolutions(graph: Graph) -> None:
             width = qtz.width
             depth = qtz.channel
             depth_upper = (depth + b - 1) // b
-            qtz.update_shape([height, width, depth_upper, 2, b], "HWChBCl")
+            qtz.update_shape([depth_upper, height, width, 2, b], "ChHWBCl")
 
 
 def pass_propagate_datatypes(graph) -> None:
@@ -530,10 +530,6 @@ def pass_propagate_format(graph) -> None:
                 b = 32
                 shape = [(m.channel + b - 1) // b, m.height, m.width, 2, b]
                 m.update_shape(shape, m.input_nodes[0].dimension)
-            elif m.input_nodes[0].dimension == 'HWChBCl':
-                b = 32
-                shape = [m.height, m.width, (m.channel + b - 1) // b, 2, b]
-                m.update_shape(shape, m.input_nodes[0].dimension)
 
 
 def pass_propagate_output_type_backward(graph: Graph) -> None:
diff --git a/dlk/python/dlk/templates/include/pack_input_to_qwords.h b/dlk/python/dlk/templates/include/pack_input_to_qwords.h
index daed9236d..5ed8f0ed1 100644
--- a/dlk/python/dlk/templates/include/pack_input_to_qwords.h
+++ b/dlk/python/dlk/templates/include/pack_input_to_qwords.h
@@ -30,7 +30,7 @@ void pack_input_to_qwords(
   struct binary_convolution_parameters bcp);
 
 
-int pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
+void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
   size_t bits_per_input, QUANTIZED_PACKED output[]);
 
 #endif // DLK_PACK_INPUT_TO_QWORDS_H_INCLUDED
diff --git a/dlk/python/dlk/templates/include/quantizer.h b/dlk/python/dlk/templates/include/quantizer.h
index d3846e3f8..6e98ff828 100644
--- a/dlk/python/dlk/templates/include/quantizer.h
+++ b/dlk/python/dlk/templates/include/quantizer.h
@@ -46,7 +46,7 @@ void func_QTZ_linear_mid_tread_half(
     const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
     const TensorView<T_INT, MemoryLayout::Atom>& nbit,
     const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
-    const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output,
+    const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output,
     BYTE *temporary_buf);
 
 void func_QTZ_linear_mid_tread_half(
diff --git a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
index ae8610488..5342058c8 100644
--- a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
+++ b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
@@ -24,99 +24,97 @@ limitations under the License.
 #include <x86intrin.h>
 #endif
 
-int pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
+void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_width, size_t input_depth,
   size_t bits_per_input, QUANTIZED_PACKED output[]) {
 
   Measurement::Start("pack_input");
-  const int bits_per_word = sizeof(QUANTIZED_PACKED) * CHAR_BIT;
-  int full_words_in_depth = input_depth / bits_per_word;
-  int remainder_bits_in_depth = input_depth % bits_per_word;
-  int input_index = 0;
-  int current_word = 0;
+  constexpr size_t bits_per_word = sizeof(QUANTIZED_PACKED) * CHAR_BIT;
+  const size_t full_words_in_depth = input_depth / bits_per_word;
+  const size_t blocks_in_depth = (input_depth + bits_per_word - 1) / bits_per_word;
+  const size_t remainder_bits_in_depth = input_depth % bits_per_word;
 
-  auto len = input_height * input_width * input_depth;
+  const auto area = input_height * input_width;
 #ifdef USE_NEON
-  if (input_depth % 32 == 0) {
+  if (bits_per_input == 2 && input_depth % bits_per_word == 0) {
     const uint8_t coeff_ary[16] = {
       1, 2, 4, 8, 16, 32, 64, 128,
       1, 2, 4, 8, 16, 32, 64, 128,
     };
     const auto coeff = vld1q_u8(coeff_ary);
     const auto vone = vdupq_n_u8(1);
-    constexpr int b = 32;
-    constexpr int n_bits = 2;
-    const auto blocks = len / b;
 #pragma omp parallel for
-    for (int i = 0; i < blocks; ++i) {
-      const auto v0 = vld1q_u8(input + i * b +  0);
-      const auto v1 = vld1q_u8(input + i * b + 16);
-      const auto l0 = vandq_u8(v0, vone);
-      const auto l1 = vandq_u8(v1, vone);
-      const auto m0 = vshrq_n_u8(v0, 1);
-      const auto m1 = vshrq_n_u8(v1, 1);
-      const auto ml0 = vmulq_u8(l0, coeff);
-      const auto ml1 = vmulq_u8(l1, coeff);
-      const auto mm0 = vmulq_u8(m0, coeff);
-      const auto mm1 = vmulq_u8(m1, coeff);
-      const auto al0 = vpadd_u8(vget_low_u8(ml0), vget_high_u8(ml0));
-      const auto al1 = vpadd_u8(vget_low_u8(ml1), vget_high_u8(ml1));
-      const auto am0 = vpadd_u8(vget_low_u8(mm0), vget_high_u8(mm0));
-      const auto am1 = vpadd_u8(vget_low_u8(mm1), vget_high_u8(mm1));
-      const auto bl = vpadd_u8(al0, al1);
-      const auto bm = vpadd_u8(am0, am1);
-      const auto c = vpadd_u8(bl, bm);
-      vst1_u8(reinterpret_cast<uint8_t*>(output + i * n_bits), c);
+    for (size_t i = 0; i < area; ++i) {
+      for (size_t j = 0; j < full_words_in_depth; ++j) {
+        const auto v0 = vld1q_u8(input + i*blocks_in_depth*bits_per_word + j*bits_per_word +  0);
+        const auto v1 = vld1q_u8(input + i*blocks_in_depth*bits_per_word + j*bits_per_word + 16);
+        const auto l0 = vandq_u8(v0, vone);
+        const auto l1 = vandq_u8(v1, vone);
+        const auto m0 = vshrq_n_u8(v0, 1);
+        const auto m1 = vshrq_n_u8(v1, 1);
+        const auto ml0 = vmulq_u8(l0, coeff);
+        const auto ml1 = vmulq_u8(l1, coeff);
+        const auto mm0 = vmulq_u8(m0, coeff);
+        const auto mm1 = vmulq_u8(m1, coeff);
+        const auto al0 = vpadd_u8(vget_low_u8(ml0), vget_high_u8(ml0));
+        const auto al1 = vpadd_u8(vget_low_u8(ml1), vget_high_u8(ml1));
+        const auto am0 = vpadd_u8(vget_low_u8(mm0), vget_high_u8(mm0));
+        const auto am1 = vpadd_u8(vget_low_u8(mm1), vget_high_u8(mm1));
+        const auto bl = vpadd_u8(al0, al1);
+        const auto bm = vpadd_u8(am0, am1);
+        const auto c = vpadd_u8(bl, bm);
+        vst1_u8(reinterpret_cast<uint8_t*>(output + i*bits_per_input + j*area*bits_per_input), c);
+      }
     }
     Measurement::Stop();
-    return 0;
+    return;
   }
 #endif
 
 #ifdef USE_AVX
-  constexpr std::size_t SIMD_WIDTH = 32;
-  if ((input_depth % SIMD_WIDTH) == 0) {
-    const auto blocks = len / SIMD_WIDTH;
-    for (int i = 0; i < blocks; ++i) {
-      const auto a = _mm256_loadu_si256(reinterpret_cast<__m256i*>(input + i * SIMD_WIDTH));
-      const auto l = _mm256_movemask_epi8(_mm256_slli_epi16(a, 7));
-      const auto m = _mm256_movemask_epi8(_mm256_slli_epi16(a, 6));
-      output[i*2 + 0] = QUANTIZED_PACKED(l);
-      output[i*2 + 1] = QUANTIZED_PACKED(m);
+  if (bits_per_input == 2 && input_depth % bits_per_word == 0) {
+#pragma omp parallel for
+    for (size_t i = 0; i < area; ++i) {
+      for (size_t j = 0; j < full_words_in_depth; ++j) {
+        const auto a = _mm256_loadu_si256(reinterpret_cast<__m256i*>(input + i*blocks_in_depth*bits_per_word + j*bits_per_word));
+        const auto l = _mm256_movemask_epi8(_mm256_slli_epi16(a, 7));
+        const auto m = _mm256_movemask_epi8(_mm256_slli_epi16(a, 6));
+        output[i*bits_per_input + j*area*bits_per_input + 0] = QUANTIZED_PACKED(l);
+        output[i*bits_per_input + j*area*bits_per_input + 1] = QUANTIZED_PACKED(m);
+      }
     }
     Measurement::Stop();
-    return 0;
+    return;
   }
 #endif
 
-  for (int h = 0; h < input_height; ++h)
-      for (int w = 0; w < input_width; ++w) {
-          for (int d = 0; d < full_words_in_depth; ++d) {
-              output[current_word] = QUANTIZED_PACKED(0);
-              output[current_word + 1] = QUANTIZED_PACKED(0);
-              for (int d = 0; d < bits_per_word; ++d) {
-                for(int b = 0; b < bits_per_input; ++b)
-                  output[current_word + b] = QUANTIZED_PACKED(output[current_word + b].Raw() | ((input[input_index] >> b) & 1) << d);
-                input_index++;
-              }
-              current_word += bits_per_input;
-          }
+  for (size_t i = 0; i < area; ++i) {
+    for (size_t j = 0; j < full_words_in_depth; ++j) {
+      for(size_t b = 0; b < bits_per_input; ++b) {
+        QUANTIZED_PACKED tmp(0);
+        for (size_t d = 0; d < bits_per_word; ++d) {
+          QUANTIZED_PACKED::base_t in = input[i*input_depth + j*bits_per_word + d];
+          tmp |= QUANTIZED_PACKED(((in >> b) & 1) << d);
+        }
+        output[i*bits_per_input + j*area*bits_per_input + b] = tmp;
+      }
+    }
 
-          if(!remainder_bits_in_depth)
-              continue;
+    if(!remainder_bits_in_depth)
+      continue;
 
-          output[current_word] = QUANTIZED_PACKED(0);
-          output[current_word + 1] = QUANTIZED_PACKED(0);
-          for (int d = 0; d < remainder_bits_in_depth; ++d) {
-             for(int b = 0; b < bits_per_input; ++b)
-               output[current_word + b] = QUANTIZED_PACKED(output[current_word + b].Raw() | ((input[input_index] >> b) & 1) << d);
-             input_index++;
-          }
-          current_word += bits_per_input;
+    for(size_t b = 0; b < bits_per_input; ++b) {
+      QUANTIZED_PACKED tmp(0);
+      for (size_t d = 0; d < remainder_bits_in_depth; ++d) {
+        QUANTIZED_PACKED::base_t in = input[i*input_depth + full_words_in_depth*bits_per_word + d];
+        tmp |= QUANTIZED_PACKED(((in >> b) & 1) << d);
       }
+      output[i*bits_per_input + full_words_in_depth*area*bits_per_input + b] = tmp;
+    }
+  }
 
   Measurement::Stop();
 
-  return current_word;
+  return;
 }
 
 void pack_input_to_qwords(
diff --git a/dlk/python/dlk/templates/src/quantizer.cpp b/dlk/python/dlk/templates/src/quantizer.cpp
index ee8feb751..a33d4dcd8 100644
--- a/dlk/python/dlk/templates/src/quantizer.cpp
+++ b/dlk/python/dlk/templates/src/quantizer.cpp
@@ -120,7 +120,7 @@ void func_QTZ_linear_mid_tread_half(
     const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
     const TensorView<T_INT, MemoryLayout::Atom>& nbit,
     const TensorView<T_FLOAT, MemoryLayout::Atom>& max_value,
-    const TensorView<QUANTIZED_PACKED, MemoryLayout::HWChBCl>& output,
+    const TensorView<QUANTIZED_PACKED, MemoryLayout::ChHWBCl>& output,
     BYTE *temporary_buf) {
   Measurement::Start("QTZ_linear_mid_tread_half");
 

From 43b67edc607288156bef4cbb3bd360a3345352ca Mon Sep 17 00:00:00 2001
From: primenumber <prime@kmc.gr.jp>
Date: Tue, 28 Jan 2020 12:20:50 +0900
Subject: [PATCH 2/2] Just formatting

---
 dlk/python/dlk/templates/src/pack_input_to_qwords.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
index 5342058c8..2727478b7 100644
--- a/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
+++ b/dlk/python/dlk/templates/src/pack_input_to_qwords.cpp
@@ -89,7 +89,7 @@ void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_
 
   for (size_t i = 0; i < area; ++i) {
     for (size_t j = 0; j < full_words_in_depth; ++j) {
-      for(size_t b = 0; b < bits_per_input; ++b) {
+      for (size_t b = 0; b < bits_per_input; ++b) {
         QUANTIZED_PACKED tmp(0);
         for (size_t d = 0; d < bits_per_word; ++d) {
           QUANTIZED_PACKED::base_t in = input[i*input_depth + j*bits_per_word + d];
@@ -99,10 +99,10 @@ void pack_input(QUANTIZED_NOT_PACKED input[], size_t input_height, size_t input_
       }
     }
 
-    if(!remainder_bits_in_depth)
+    if (!remainder_bits_in_depth)
       continue;
 
-    for(size_t b = 0; b < bits_per_input; ++b) {
+    for (size_t b = 0; b < bits_per_input; ++b) {
       QUANTIZED_PACKED tmp(0);
       for (size_t d = 0; d < remainder_bits_in_depth; ++d) {
         QUANTIZED_PACKED::base_t in = input[i*input_depth + full_words_in_depth*bits_per_word + d];