Optimize batchnorm (#469)

blue-oil · Oct 1, 2019 · 2ee4a41 · 2ee4a41
1 parent 502e844
commit 2ee4a41
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 29 deletions.
diff --git a/dlk/python/dlk/templates/CMakeLists.txt b/dlk/python/dlk/templates/CMakeLists.txt
@@ -65,7 +65,7 @@ elseif(USE_NEON)
     list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/quantized_conv2d_tiling.cpp)
     list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pop_count.cpp)
 elseif(USE_AVX)
-    list(APPEND SRC_LIB_ALL src/func/generic/batch_normalization.cpp)
+    list(APPEND SRC_LIB_ALL src/func/x86_avx/batch_normalization.cpp)
     list(APPEND SRC_LIB_ALL src/func/impl/x86_avx/quantized_conv2d_tiling.cpp)
     list(APPEND SRC_LIB_ALL src/func/impl/generic/pop_count.cpp)
 else()

diff --git a/dlk/python/dlk/templates/Makefile.tpl b/dlk/python/dlk/templates/Makefile.tpl
@@ -67,7 +67,7 @@ LIB_X86_SRC := \
 LIB_X86_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_SRC))
 
 LIB_X86_AVX_SRC := \
-    $(SRC_DIR)/func/generic/batch_normalization.cpp \
+    $(SRC_DIR)/func/x86_avx/batch_normalization.cpp \
     $(SRC_DIR)/func/impl/x86_avx/quantized_conv2d_tiling.cpp \
     $(SRC_DIR)/func/impl/generic/pop_count.cpp
 LIB_X86_AVX_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_AVX_SRC))

diff --git a/dlk/python/dlk/templates/src/func/arm_neon/batch_normalization.cpp b/dlk/python/dlk/templates/src/func/arm_neon/batch_normalization.cpp
@@ -14,13 +14,17 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cmath>
+#include <memory>
 
 #include "global.h"
 #include "func/batch_normalization.h"
 #include "time_measurement.h"
 
 #include <arm_neon.h>
 
+static const auto scale = std::make_unique<float[]>(MAX_IN_C);
+static const auto shift = std::make_unique<float[]>(MAX_IN_C);
+
 void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
     const TensorView<T_FLOAT, MemoryLayout::C>& gamma,
     const TensorView<T_FLOAT, MemoryLayout::C>& beta,
@@ -35,9 +39,6 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
   T_UINT out_width = out_shape[2];
   T_UINT out_depth = out_shape[3];
 
-  // temporary fix: will be replaced by pre-allocated one
-  T_FLOAT *scale = new float[out_depth];
-  T_FLOAT *shift = new float[out_depth];
   T_UINT size = out_height * out_width;
 
   float32x4_t eps_batch = vdupq_n_f32(epsilon);
@@ -69,27 +70,15 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
 // TODO(nlpng): remove use of OpenMP library
 #pragma omp parallel for
   for (T_UINT f = 0; f < size; f++) {
-
     T_FLOAT *in_temp = input.data() + f * out_depth;
     T_FLOAT *out_temp = output.data() + f * out_depth;
 
     T_UINT d = 0;
     for (; d + 3 < out_depth; d += 4) {
-#ifdef AARCH32
-      asm volatile("vldmia %0, {d16,d17}    \t\n" // q8(d16,d17) scale
-                   "vldmia %1, {d18,d19}    \t\n" // q9(d18,d19) shift
-                   "vldmia %2, {d20,d21}    \t\n" // q10(d20,d21) input
-                   "vmla.f32 q9, q10, q8    \t\n"
-                   "vstmia %3, {d18,d19}    \t\n"
-                   :
-                   : "r"(&scale[d]), "r"(&shift[d]), "r"(in_temp), "r"(out_temp)
-                   : "memory", "q8", "q9", "q10");
-#else
-      const auto scale_v = vld1q_f32(scale + d);
-      const auto shift_v = vld1q_f32(shift + d);
+      const auto scale_v = vld1q_f32(scale.get() + d);
+      const auto shift_v = vld1q_f32(shift.get() + d);
       const auto in_v = vld1q_f32(in_temp);
       vst1q_f32(out_temp, vmlaq_f32(shift_v, in_v, scale_v));
-#endif
       in_temp += 4;
       out_temp += 4;
     }
@@ -99,8 +88,5 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
     }
   }
 
-  delete[] scale;
-  delete[] shift;
-
   Measurement::Stop();
 }
diff --git a/dlk/python/dlk/templates/src/func/generic/batch_normalization.cpp b/dlk/python/dlk/templates/src/func/generic/batch_normalization.cpp
@@ -14,11 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cmath>
+#include <memory>
 
 #include "global.h"
 #include "func/batch_normalization.h"
 #include "time_measurement.h"
 
+static const auto scale = std::make_unique<float[]>(MAX_IN_C);
+static const auto shift = std::make_unique<float[]>(MAX_IN_C);
+
 void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
     const TensorView<T_FLOAT, MemoryLayout::C>& gamma,
     const TensorView<T_FLOAT, MemoryLayout::C>& beta,
@@ -32,10 +36,6 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
   const unsigned out_width = output.get_shape()[2];
   const unsigned out_depth = output.get_shape()[3];
 
-  // temporary fix: will be replaced by pre-allocated one
-  T_FLOAT *scale = new float[out_depth];
-  T_FLOAT *shift = new float[out_depth];
-
   for (T_UINT i = 0; i < out_depth; i++)
     scale[i] = gamma(i) * (1.0 / std::sqrt(variance(i) + epsilon));
 
@@ -50,8 +50,5 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
     }
   }
 
-  delete[] scale;
-  delete[] shift;
-
   Measurement::Stop();
 }
diff --git a/dlk/python/dlk/templates/src/func/x86_avx/batch_normalization.cpp b/dlk/python/dlk/templates/src/func/x86_avx/batch_normalization.cpp
@@ -0,0 +1,66 @@
+/* Copyright 2018 The Blueoil Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <memory>
+
+#include "global.h"
+#include "func/batch_normalization.h"
+#include "time_measurement.h"
+
+#include <x86intrin.h>
+
+static const auto scale = std::make_unique<float[]>(MAX_IN_C);
+static const auto shift = std::make_unique<float[]>(MAX_IN_C);
+
+void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
+    const TensorView<T_FLOAT, MemoryLayout::C>& gamma,
+    const TensorView<T_FLOAT, MemoryLayout::C>& beta,
+    const TensorView<T_FLOAT, MemoryLayout::C>& mean,
+    const TensorView<T_FLOAT, MemoryLayout::C>& variance,
+    T_FLOAT epsilon,
+    const TensorView<T_FLOAT, MemoryLayout::NHWC>& output) {
+  Measurement::Start("BatchNorm");
+
+  const unsigned out_height = output.get_shape()[1];
+  const unsigned out_width = output.get_shape()[2];
+  const unsigned out_depth = output.get_shape()[3];
+
+  for (T_UINT i = 0; i < out_depth; i++) {
+    scale[i] = gamma(i) * (1.0 / std::sqrt(variance(i) + epsilon));
+    shift[i] = beta(i) - (scale[i] * mean(i));
+  }
+
+  std::size_t size = out_height * out_width;
+#pragma omp parallel for
+  for (std::size_t f = 0; f < size; ++f) {
+    std::size_t d;
+    for (d = 0; d + 7 < out_depth; d += 8) {
+      const auto index = f * out_depth + d;
+      const auto vscale = _mm256_loadu_ps(scale.get() + d);
+      const auto vshift = _mm256_loadu_ps(shift.get() + d);
+      const auto vinput = _mm256_loadu_ps(input.data() + index);
+      const auto res = _mm256_fmadd_ps(vinput, vscale, vshift);
+      _mm256_storeu_ps(output.data() + index, res);
+    }
+
+    for (; d < out_depth; ++d) {
+      const auto index = f * out_depth + d;
+      output.data()[index] = input.data()[index] * scale[d] + shift[d];
+    }
+  }
+
+  Measurement::Stop();
+}