add elementwise_mod on x86, increment on host; test=develop
zhupengyang committed Feb 5, 2021
1 parent 36afd0b commit b536030
Showing 14 changed files with 186 additions and 129 deletions.
1 change: 0 additions & 1 deletion lite/backends/arm/math/CMakeLists.txt
@@ -127,7 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
sequence_softmax.cc
norm.cc
topk.cc
-increment.cc
pad2d.cc
negative.cc
beam_search.cc
1 change: 0 additions & 1 deletion lite/backends/arm/math/funcs.h
@@ -40,7 +40,6 @@
#include "lite/backends/arm/math/gemm_s8.h"
#include "lite/backends/arm/math/gemv_arm_int8.h"
#include "lite/backends/arm/math/im2sequence.h"
#include "lite/backends/arm/math/increment.h"
#include "lite/backends/arm/math/interpolate.h"
#include "lite/backends/arm/math/layout.h"
#include "lite/backends/arm/math/lrn.h"
26 changes: 0 additions & 26 deletions lite/backends/arm/math/increment.cc

This file was deleted.

38 changes: 0 additions & 38 deletions lite/backends/arm/math/increment.h

This file was deleted.

2 changes: 1 addition & 1 deletion lite/kernels/arm/CMakeLists.txt
@@ -96,7 +96,7 @@ add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS
add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} increment_compute_host)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm)
38 changes: 2 additions & 36 deletions lite/kernels/arm/increment_compute.cc
@@ -12,47 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/arm/increment_compute.h"
#include "lite/backends/arm/math/funcs.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

void IncrementCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>();
auto& param = this->Param<operators::IncrementParam>();

int total_num = param.X->dims().production();
if (param.X->precision() == PRECISION(kFloat)) {
const auto* x_data = param.X->data<float>();
auto* o_data = param.Out->mutable_data<float>();
lite::arm::math::increment(x_data, total_num, param.step, o_data, &ctx);
} else if (param.X->precision() == PRECISION(kInt64)) {
const auto* x_data = param.X->data<int64_t>();
auto* o_data = param.Out->mutable_data<int64_t>();
lite::arm::math::increment(x_data, total_num, param.step, o_data, &ctx);
} else if (param.X->precision() == PRECISION(kInt32)) {
const auto* x_data = param.X->data<int32_t>();
auto* o_data = param.Out->mutable_data<int32_t>();
lite::arm::math::increment(x_data, total_num, param.step, o_data, &ctx);
} else {
LOG(FATAL) << "unsupport input type "
<< PrecisionToStr(param.X->precision());
}
}

} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
#include "lite/kernels/host/increment_compute.h"

REGISTER_LITE_KERNEL(increment,
kARM,
kAny,
kNCHW,
paddle::lite::kernels::arm::IncrementCompute,
paddle::lite::kernels::host::IncrementCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
1 change: 1 addition & 0 deletions lite/kernels/host/CMakeLists.txt
@@ -53,6 +53,7 @@ add_kernel(box_coder_compute_host Host basic SRCS box_coder_compute.cc DEPS ${li
add_kernel(gather_compute_host Host extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_host)
add_kernel(gather_nd_compute_host Host extra SRCS gather_nd_compute.cc DEPS ${lite_kernel_deps})
add_kernel(gather_tree_compute_host Host extra SRCS gather_tree_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(increment_compute_host Host extra SRCS increment_compute.cc DEPS ${lite_kernel_deps})
add_kernel(pad3d_compute_host Host extra SRCS pad3d_compute.cc DEPS ${lite_kernel_deps} math_host)
add_kernel(select_input_compute_host Host extra SRCS select_input_compute.cc DEPS ${lite_kernel_deps} math_host)
add_kernel(tensor_array_to_tensor_compute_host Host extra SRCS tensor_array_to_tensor_compute.cc DEPS ${lite_kernel_deps} math_host)
6 changes: 3 additions & 3 deletions lite/kernels/host/expand_compute.cc
@@ -80,7 +80,7 @@ void ExpandCompute<T, PType>::Run() {

using expand_float =
paddle::lite::kernels::host::ExpandCompute<float, PRECISION(kFloat)>;
-REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_float, def)
+REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_float, float32)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
@@ -100,8 +100,8 @@ REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_float, def)
.Finalize();

using expand_int32 =
-    paddle::lite::kernels::host::ExpandCompute<int, PRECISION(kInt32)>;
-REGISTER_LITE_KERNEL(expand, kHost, kInt32, kAny, expand_int32, def)
+    paddle::lite::kernels::host::ExpandCompute<int, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_int32, int32)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt32),
74 changes: 74 additions & 0 deletions lite/kernels/host/increment_compute.cc
@@ -0,0 +1,74 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/host/increment_compute.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace host {

template <class T>
void increment(const T* input, const int n, const T step, T* out) {
  for (int i = 0; i < n; i++) {
    out[i] = input[i] + step;
  }
}

void IncrementCompute::Run() {
  auto& param = this->Param<operators::IncrementParam>();

  int total_num = param.X->numel();
  switch (param.X->precision()) {
    case PRECISION(kFloat): {
      const auto* x_data = param.X->data<float>();
      auto* o_data = param.Out->mutable_data<float>();
      float step = static_cast<float>(param.step);
      increment(x_data, total_num, step, o_data);
      break;
    }
    case PRECISION(kInt64): {
      const auto* x_data = param.X->data<int64_t>();
      auto* o_data = param.Out->mutable_data<int64_t>();
      int64_t step = static_cast<int64_t>(param.step);
      increment(x_data, total_num, step, o_data);
      break;
    }
    case PRECISION(kInt32): {
      const auto* x_data = param.X->data<int32_t>();
      auto* o_data = param.Out->mutable_data<int32_t>();
      int32_t step = static_cast<int32_t>(param.step);
      increment(x_data, total_num, step, o_data);
      break;
    }
    default:
      LOG(FATAL) << "unsupport input type "
                 << PrecisionToStr(param.X->precision());
  }
}

}  // namespace host
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(increment,
                     kHost,
                     kAny,
                     kNCHW,
                     paddle::lite::kernels::host::IncrementCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .Finalize();
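
The new host kernel above keeps the increment logic framework-neutral: it dispatches on the input tensor's runtime precision and adds the scalar step element-wise, with no dependency on the ARM math library. A minimal standalone sketch of the same semantics (plain C++, no Lite types; the helper and variable names here are illustrative only):

#include <cstdint>
#include <iostream>
#include <vector>

// Same element-wise rule as the kernel's increment<T> helper: out[i] = in[i] + step.
template <class T>
std::vector<T> increment_ref(const std::vector<T>& in, T step) {
  std::vector<T> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] + step;
  return out;
}

int main() {
  // e.g. bumping a one-element int64 loop counter by 1
  std::vector<int64_t> counter{7};
  std::cout << increment_ref<int64_t>(counter, 1)[0] << std::endl;  // prints 8
  return 0;
}
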
lite/kernels/{arm => host}/increment_compute.h
@@ -13,17 +13,15 @@
// limitations under the License.

#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
-namespace arm {
+namespace host {

-class IncrementCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
+class IncrementCompute : public KernelLite<TARGET(kHost), PRECISION(kAny)> {
public:
void Run() override;

@@ -32,7 +30,7 @@ class IncrementCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
private:
};

-}  // namespace arm
+}  // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
22 changes: 22 additions & 0 deletions lite/kernels/x86/elementwise_compute.cc
@@ -82,3 +82,25 @@ REGISTER_LITE_KERNEL(
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.Finalize();

REGISTER_LITE_KERNEL(elementwise_mod,
                     kX86,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::x86::ElementwiseModCompute<int32_t>,
                     int32)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
    .Finalize();

REGISTER_LITE_KERNEL(elementwise_mod,
                     kX86,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::x86::ElementwiseModCompute<int64_t>,
                     int64)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
    .Finalize();
25 changes: 25 additions & 0 deletions lite/kernels/x86/elementwise_compute.h
@@ -45,6 +45,15 @@ struct FloorDivFunctor {
}
};

template <typename T>
struct ModFunctor {
  inline HOSTDEVICE T operator()(T a, T b) const {
    T res = a % b;
    if ((res != 0) && ((res < 0) != (b < 0))) res += b;
    return res;
  }
};

template <typename T>
class ElementwiseSubCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
@@ -117,6 +126,22 @@ class ElementwiseFloorDivCompute
virtual ~ElementwiseFloorDivCompute() = default;
};

template <typename T>
class ElementwiseModCompute
    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  using param_t = operators::ElementwiseParam;
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    auto& context = ctx_->As<X86Context>();
    param.Out->template mutable_data<T>();
    ElementwiseComputeEx<ModFunctor<T>, lite::TargetType::kX86, T>(
        context, param.X, param.Y, param.axis, ModFunctor<T>(), param.Out);
  }

  virtual ~ElementwiseModCompute() = default;
};

} // namespace x86
} // namespace kernels
} // namespace lite
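
The sign fix-up in the new ModFunctor above makes the remainder take the divisor's sign (floored modulo, as in Python's %), rather than C++'s truncated %. A small standalone check of that behavior, independent of the kernel framework (the function name here is illustrative):

#include <cassert>
#include <cstdint>

// Same rule as ModFunctor: take the truncated remainder, then shift it onto b's sign.
template <typename T>
T floored_mod(T a, T b) {
  T res = a % b;
  if ((res != 0) && ((res < 0) != (b < 0))) res += b;
  return res;
}

int main() {
  assert(floored_mod<int32_t>(7, 3) == 1);     // same as plain %
  assert(floored_mod<int32_t>(-7, 3) == 2);    // plain % would give -1
  assert(floored_mod<int32_t>(7, -3) == -2);   // plain % would give 1
  assert(floored_mod<int64_t>(-7, -3) == -1);  // same as plain %
  return 0;
}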