diff --git a/dlk/python/dlk/templates/CMakeLists.txt b/dlk/python/dlk/templates/CMakeLists.txt
index d40edfd89..c21181cf8 100644
--- a/dlk/python/dlk/templates/CMakeLists.txt
+++ b/dlk/python/dlk/templates/CMakeLists.txt
@@ -65,7 +65,7 @@ elseif(USE_NEON)
     list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/quantized_conv2d_tiling.cpp)
     list(APPEND SRC_LIB_ALL src/func/impl/arm_neon/pop_count.cpp)
 elseif(USE_AVX)
-    list(APPEND SRC_LIB_ALL src/func/generic/batch_normalization.cpp)
+    list(APPEND SRC_LIB_ALL src/func/x86_avx/batch_normalization.cpp)
     list(APPEND SRC_LIB_ALL src/func/impl/x86_avx/quantized_conv2d_tiling.cpp)
     list(APPEND SRC_LIB_ALL src/func/impl/generic/pop_count.cpp)
 else()
diff --git a/dlk/python/dlk/templates/Makefile.tpl b/dlk/python/dlk/templates/Makefile.tpl
index ec30517ea..7ba7b4283 100644
--- a/dlk/python/dlk/templates/Makefile.tpl
+++ b/dlk/python/dlk/templates/Makefile.tpl
@@ -67,7 +67,7 @@ LIB_X86_SRC := \
 LIB_X86_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_SRC))
 
 LIB_X86_AVX_SRC := \
-    $(SRC_DIR)/func/generic/batch_normalization.cpp \
+    $(SRC_DIR)/func/x86_avx/batch_normalization.cpp \
     $(SRC_DIR)/func/impl/x86_avx/quantized_conv2d_tiling.cpp \
     $(SRC_DIR)/func/impl/generic/pop_count.cpp
 LIB_X86_AVX_OBJ := $(patsubst %.cpp, %.o, $(LIB_X86_AVX_SRC))
diff --git a/dlk/python/dlk/templates/src/func/arm_neon/batch_normalization.cpp b/dlk/python/dlk/templates/src/func/arm_neon/batch_normalization.cpp
index 16ba9230b..b079c6825 100644
--- a/dlk/python/dlk/templates/src/func/arm_neon/batch_normalization.cpp
+++ b/dlk/python/dlk/templates/src/func/arm_neon/batch_normalization.cpp
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cmath>
+#include <memory>
 
 #include "global.h"
 #include "func/batch_normalization.h"
@@ -21,6 +22,9 @@ limitations under the License.
 
 #include <arm_neon.h>
 
+static const auto scale = std::make_unique<float[]>(MAX_IN_C);
+static const auto shift = std::make_unique<float[]>(MAX_IN_C);
+
 void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
                              const TensorView<T_FLOAT, MemoryLayout::C>& gamma,
                              const TensorView<T_FLOAT, MemoryLayout::C>& beta,
@@ -35,9 +39,6 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
   T_UINT out_width = out_shape[2];
   T_UINT out_depth = out_shape[3];
 
-  // temporary fix: will be replaced by pre-allocated one
-  T_FLOAT *scale = new float[out_depth];
-  T_FLOAT *shift = new float[out_depth];
   T_UINT size = out_height * out_width;
 
   float32x4_t eps_batch = vdupq_n_f32(epsilon);
@@ -69,27 +70,15 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
   // TODO(nlpng): remove use of OpenMP library
 #pragma omp parallel for
   for (T_UINT f = 0; f < size; f++) {
     T_FLOAT *in_temp = input.data() + f * out_depth;
     T_FLOAT *out_temp = output.data() + f * out_depth;
 
     T_UINT d = 0;
     for (; d + 3 < out_depth; d += 4) {
-#ifdef AARCH32
-      asm volatile("vldmia %0, {d16,d17} \t\n" // q8(d16,d17) scale
-                   "vldmia %1, {d18,d19} \t\n" // q9(d18,d19) shift
-                   "vldmia %2, {d20,d21} \t\n" // q10(d20,d21) input
-                   "vmla.f32 q9, q10, q8 \t\n"
-                   "vstmia %3, {d18,d19} \t\n"
-                   :
-                   : "r"(&scale[d]), "r"(&shift[d]),
-                     "r"(in_temp), "r"(out_temp)
-                   : "memory", "q8", "q9", "q10");
-#else
-      const auto scale_v = vld1q_f32(scale + d);
-      const auto shift_v = vld1q_f32(shift + d);
+      const auto scale_v = vld1q_f32(scale.get() + d);
+      const auto shift_v = vld1q_f32(shift.get() + d);
       const auto in_v = vld1q_f32(in_temp);
       vst1q_f32(out_temp, vmlaq_f32(shift_v, in_v, scale_v));
-#endif
       in_temp += 4;
       out_temp += 4;
     }
@@ -99,8 +88,5 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
     }
   }
 
-  delete[] scale;
-  delete[] shift;
-
   Measurement::Stop();
 }
diff --git a/dlk/python/dlk/templates/src/func/generic/batch_normalization.cpp b/dlk/python/dlk/templates/src/func/generic/batch_normalization.cpp
index 38f128291..05c374df9 100644
--- a/dlk/python/dlk/templates/src/func/generic/batch_normalization.cpp
+++ b/dlk/python/dlk/templates/src/func/generic/batch_normalization.cpp
@@ -14,11 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cmath>
+#include <memory>
 
 #include "global.h"
 #include "func/batch_normalization.h"
 #include "time_measurement.h"
 
+static const auto scale = std::make_unique<float[]>(MAX_IN_C);
+static const auto shift = std::make_unique<float[]>(MAX_IN_C);
+
 void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
                              const TensorView<T_FLOAT, MemoryLayout::C>& gamma,
                              const TensorView<T_FLOAT, MemoryLayout::C>& beta,
@@ -32,10 +36,6 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
   const unsigned out_width = output.get_shape()[2];
   const unsigned out_depth = output.get_shape()[3];
 
-  // temporary fix: will be replaced by pre-allocated one
-  T_FLOAT *scale = new float[out_depth];
-  T_FLOAT *shift = new float[out_depth];
-
   for (T_UINT i = 0; i < out_depth; i++)
     scale[i] = gamma(i) * (1.0 / std::sqrt(variance(i) + epsilon));
 
@@ -50,8 +50,5 @@ void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& inpu
     }
   }
 
-  delete[] scale;
-  delete[] shift;
-
   Measurement::Stop();
 }
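For reference, both kernels above implement the same folding of batch normalization into a per-channel affine transform: scale[c] = gamma[c] / sqrt(variance[c] + epsilon), shift[c] = beta[c] - scale[c] * mean[c], then out = in * scale + shift at every pixel. The change in this patch is that scale and shift now live in buffers allocated once per process (sized to MAX_IN_C) instead of being new[]'d and delete[]'d on every call. A minimal standalone sketch of that computation follows; the function and parameter names are illustrative, not part of the patch:

#include <cmath>
#include <cstddef>

// Fold batch normalization into per-channel scale/shift, then apply it to
// an NHWC tensor flattened to (pixels, depth). The scale/shift buffers are
// caller-provided so they can be preallocated, as in the patch.
void batch_norm_nhwc(const float *in, float *out,
                     std::size_t pixels, std::size_t depth,
                     const float *gamma, const float *beta,
                     const float *mean, const float *variance,
                     float epsilon, float *scale, float *shift) {
  for (std::size_t c = 0; c < depth; ++c) {
    scale[c] = gamma[c] / std::sqrt(variance[c] + epsilon);
    shift[c] = beta[c] - scale[c] * mean[c];
  }
  for (std::size_t p = 0; p < pixels; ++p)
    for (std::size_t c = 0; c < depth; ++c)
      out[p * depth + c] = in[p * depth + c] * scale[c] + shift[c];
}

One consequence of the file-scope buffers worth noting: they are shared state, so two threads calling func_BatchNormalization concurrently would race on scale/shift. The OpenMP parallelism stays inside a single call, where the buffers are only read, so the patch is safe under that usage.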
diff --git a/dlk/python/dlk/templates/src/func/x86_avx/batch_normalization.cpp b/dlk/python/dlk/templates/src/func/x86_avx/batch_normalization.cpp
new file mode 100644
index 000000000..09e764e87
--- /dev/null
+++ b/dlk/python/dlk/templates/src/func/x86_avx/batch_normalization.cpp
@@ -0,0 +1,66 @@
+/* Copyright 2018 The Blueoil Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <memory>
+
+#include "global.h"
+#include "func/batch_normalization.h"
+#include "time_measurement.h"
+
+#include <x86intrin.h>
+
+static const auto scale = std::make_unique<float[]>(MAX_IN_C);
+static const auto shift = std::make_unique<float[]>(MAX_IN_C);
+
+void func_BatchNormalization(const TensorView<T_FLOAT, MemoryLayout::NHWC>& input,
+                             const TensorView<T_FLOAT, MemoryLayout::C>& gamma,
+                             const TensorView<T_FLOAT, MemoryLayout::C>& beta,
+                             const TensorView<T_FLOAT, MemoryLayout::C>& mean,
+                             const TensorView<T_FLOAT, MemoryLayout::C>& variance,
+                             T_FLOAT epsilon,
+                             const TensorView<T_FLOAT, MemoryLayout::NHWC>& output) {
+  Measurement::Start("BatchNorm");
+
+  const unsigned out_height = output.get_shape()[1];
+  const unsigned out_width = output.get_shape()[2];
+  const unsigned out_depth = output.get_shape()[3];
+
+  for (T_UINT i = 0; i < out_depth; i++) {
+    scale[i] = gamma(i) * (1.0 / std::sqrt(variance(i) + epsilon));
+    shift[i] = beta(i) - (scale[i] * mean(i));
+  }
+
+  std::size_t size = out_height * out_width;
+#pragma omp parallel for
+  for (std::size_t f = 0; f < size; ++f) {
+    std::size_t d;
+    for (d = 0; d + 7 < out_depth; d += 8) {
+      const auto index = f * out_depth + d;
+      const auto vscale = _mm256_loadu_ps(scale.get() + d);
+      const auto vshift = _mm256_loadu_ps(shift.get() + d);
+      const auto vinput = _mm256_loadu_ps(input.data() + index);
+      const auto res = _mm256_fmadd_ps(vinput, vscale, vshift);
+      _mm256_storeu_ps(output.data() + index, res);
+    }
+
+    for (; d < out_depth; ++d) {
+      const auto index = f * out_depth + d;
+      output.data()[index] = input.data()[index] * scale[d] + shift[d];
+    }
+  }
+
+  Measurement::Stop();
+}
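The new AVX kernel processes eight channels per iteration, computing in * scale + shift with a single fused multiply-add, and falls through to a scalar loop for the depth % 8 remainder. A condensed sketch of that inner pattern is below; the names are illustrative, and since _mm256_fmadd_ps is an FMA instruction the translation unit must be built with FMA enabled (e.g. -mavx2 -mfma on GCC/Clang):

#include <immintrin.h>
#include <cstddef>

// Apply precomputed per-channel scale/shift to one row of `depth` floats.
void batch_norm_row_avx(const float *in, float *out, std::size_t depth,
                        const float *scale, const float *shift) {
  std::size_t d = 0;
  for (; d + 7 < depth; d += 8) {
    const __m256 vscale = _mm256_loadu_ps(scale + d);
    const __m256 vshift = _mm256_loadu_ps(shift + d);
    const __m256 vin = _mm256_loadu_ps(in + d);
    // vin * vscale + vshift in one fused instruction
    _mm256_storeu_ps(out + d, _mm256_fmadd_ps(vin, vscale, vshift));
  }
  for (; d < depth; ++d) // scalar tail when depth is not a multiple of 8
    out[d] = in[d] * scale[d] + shift[d];
}

The unaligned loads and stores (_mm256_loadu_ps / _mm256_storeu_ps) match the patch: each row starts at offset f * out_depth, which is not guaranteed to be 32-byte aligned for arbitrary out_depth.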