PaddlePaddle · chenwhql · Feb 15, 2022 · Feb 9, 2022 · Feb 11, 2022 · Feb 11, 2022
diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc
@@ -16,6 +16,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+#define MAX_RANK_SUPPORTED 6
 
 namespace paddle {
 namespace operators {
@@ -296,33 +299,3 @@ REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
                   ops::ExpandV2DoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::ExpandV2DoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::ExpandV2GradNoNeedBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, bool>);
-REGISTER_OP_CPU_KERNEL(
-    expand_v2_grad,
-    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ExpandV2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL(
-    expand_v2, ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::float16>,
-    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ExpandV2Kernel<paddle::platform::CUDADeviceContext, bool>);
-REGISTER_OP_CUDA_KERNEL(
-    expand_v2_grad,
-    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::float16>,
-    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ExpandV2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
-#endif
diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h
@@ -91,259 +91,5 @@ inline std::vector<int> get_expand_shape(
     return ctx.Attr<std::vector<int>>("shape");
   }
 }
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using framework::To32BitIndex;
-
-template <typename DeviceContext, typename T>
-class ExpandV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto rank = context.Input<Tensor>("X")->dims().size();
-    PADDLE_ENFORCE_GE(
-        rank, 1,
-        platform::errors::InvalidArgument(
-            "The rank of the input 'X' for expand_v2 op must be positive, "
-            "but the value received is %d.",
-            rank));
-    PADDLE_ENFORCE_LE(
-        rank, MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The rank of the input 'X' for expand_v2 op must be less than "
-            "or equal to %d, but the value received is %d.",
-            MAX_RANK_SUPPORTED, rank));
-    auto expand_shape = get_expand_shape(context);
-    auto shape_size = expand_shape.size();
-    PADDLE_ENFORCE_GE(
-        shape_size, rank,
-        platform::errors::InvalidArgument(
-            "The number (%d) of elements of 'shape' for expand_v2 op must be "
-            "greater than or equal to the rank (%d) of the input 'X'.",
-            shape_size, rank));
-    PADDLE_ENFORCE_LE(
-        shape_size, MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The number (%d) of elements of 'shape' for expand_v2 op must be "
-            "less than or equal to %d.",
-            shape_size, MAX_RANK_SUPPORTED));
-    rank = std::max(rank, static_cast<int>(shape_size));
-    switch (rank) {
-      case 1:
-        Expand<1>(context);
-        break;
-      case 2:
-        Expand<2>(context);
-        break;
-      case 3:
-        Expand<3>(context);
-        break;
-      case 4:
-        Expand<4>(context);
-        break;
-      case 5:
-        Expand<5>(context);
-        break;
-      case 6:
-        Expand<6>(context);
-        break;
-    }
-  }
-
- protected:
-  template <int Rank>
-  void Expand(const framework::ExecutionContext& context) const {
-    auto* in0 = context.Input<Tensor>("X");
-
-    auto in_dims = in0->dims();
-    auto expand_shape = get_expand_shape(context);
-    auto vec_in_dims = framework::vectorize<int>(in_dims);
-    auto diff = expand_shape.size() - vec_in_dims.size();
-    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-    std::vector<int> repeat_times(vec_in_dims.size());
-    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
-      PADDLE_ENFORCE_NE(expand_shape[i], 0,
-                        platform::errors::InvalidArgument(
-                            "The expanded size cannot be zero."));
-      if (i < diff) {
-        PADDLE_ENFORCE_GT(
-            expand_shape[i], 0,
-            platform::errors::InvalidArgument(
-                "The expanded size (%d) for non-existing dimensions must be "
-                "positive for expand_v2 op.",
-                expand_shape[i]));
-        repeat_times[i] = expand_shape[i];
-      } else if (expand_shape[i] > 0) {
-        if (vec_in_dims[i] != 1) {
-          PADDLE_ENFORCE_EQ(
-              vec_in_dims[i], expand_shape[i],
-              platform::errors::InvalidArgument(
-                  "The value (%d) of the non-singleton dimension does not match"
-                  " the corresponding value (%d) in shape for expand_v2 op.",
-                  vec_in_dims[i], expand_shape[i]));
-          repeat_times[i] = 1;
-        } else {
-          repeat_times[i] = expand_shape[i];
-        }
-      } else {
-        PADDLE_ENFORCE_EQ(
-            expand_shape[i], -1,
-            platform::errors::InvalidArgument(
-                "When the value in shape is negative for expand_v2 op, "
-                "only -1 is supported, but the value received is %d.",
-                expand_shape[i]));
-        repeat_times[i] = 1;
-      }
-    }
-
-    auto* out0 = context.Output<Tensor>("Out");
-    Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      bcast_dims[i] = repeat_times[i];
-    }
-
-    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
-    framework::DDim out_dims(new_in_dims);
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      out_dims[i] *= repeat_times[i];
-    }
-
-    out0->Resize(out_dims);
-    auto x = EigenTensor<T, Rank>::From(*in0, new_in_dims);
-    out0->mutable_data<T>(context.GetPlace());
-    auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    // use 32-bit index to speed up
-    bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
-    if (use_32bit_index) {
-      EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(
-          place, To32BitIndex(y), To32BitIndex(x), bcast_dims);
-    } else {
-      EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(place, y, x,
-                                                                   bcast_dims);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ExpandV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    auto expand_shape = get_expand_shape(context);
-    auto x_dims = in0->dims();
-    auto vec_in_dims = framework::vectorize<int>(x_dims);
-    auto diff = expand_shape.size() - vec_in_dims.size();
-    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-    // 1. reshape_dims_vec is the broadcast parameter.
-    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
-    //    each dimension expanded, the gradients should be summed to original
-    //    size.
-    std::vector<int> repeat_times(vec_in_dims.size());
-    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
-      if (expand_shape[i] < 0) {
-        repeat_times[i] = 1;
-      } else {
-        repeat_times[i] = expand_shape[i] / vec_in_dims[i];
-      }
-    }
-    std::vector<int> reshape_dims_vec;
-    std::vector<int> reduce_dims_vec;
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      reduce_dims_vec.push_back(reshape_dims_vec.size());
-      reshape_dims_vec.push_back(repeat_times[i]);
-      reshape_dims_vec.push_back(vec_in_dims[i]);
-    }
-
-    int dims = reduce_dims_vec.size();
-
-    bool just_copy = true;
-    for (size_t i = 0; i < repeat_times.size(); i++) {
-      if (repeat_times[i] != 1) {
-        just_copy = false;
-        break;
-      }
-    }
-    // no need reduce, just copy
-    if (just_copy) {
-      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-      out0->mutable_data<T>(context.GetPlace());
-      framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
-                            out0);
-    } else {
-      PADDLE_ENFORCE_GE(dims, 1,
-                        platform::errors::InvalidArgument(
-                            "The rank of the input 'Out@GRAD' for "
-                            "expand_v2_grad op must be greater than or "
-                            "equal to 1, but the value received is %d.",
-                            dims));
-      PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
-                        platform::errors::InvalidArgument(
-                            "The rank of the input 'Out@GRAD' for "
-                            "expand_v2_grad op must be less than or equal "
-                            "to %d, but the value received is %d.",
-                            MAX_RANK_SUPPORTED, dims));
-      switch (dims) {
-        case 1:
-          ExpandBackward<1>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 2:
-          ExpandBackward<2>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 3:
-          ExpandBackward<3>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 4:
-          ExpandBackward<4>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 5:
-          ExpandBackward<5>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        case 6:
-          ExpandBackward<6>(context, reshape_dims_vec, reduce_dims_vec);
-          break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Only support tensor with rank being between 1 and 6. But "
-              "received tensor's rank = %d.",
-              dims));
-      }
-    }
-  }
-
- protected:
-  template <int Dims>
-  void ExpandBackward(const framework::ExecutionContext& context,
-                      const std::vector<int>& reshape_dims_vec,
-                      const std::vector<int>& reduce_dims_vec) const {
-    size_t reshape_size = reshape_dims_vec.size();
-    size_t reduce_size = reduce_dims_vec.size();
-    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
-    out0->mutable_data<T>(context.GetPlace());
-    auto x_grad = EigenVector<T>::Flatten(*out0);
-    Eigen::DSizes<Eigen::DenseIndex, Dims * 2> reshape_dims;
-    for (size_t i = 0; i < reshape_size; ++i) {
-      reshape_dims[i] = reshape_dims_vec[i];
-    }
-    Eigen::DSizes<Eigen::DenseIndex, Dims> reduce_dims;
-    for (size_t i = 0; i < reduce_size; ++i) {
-      reduce_dims[i] = reduce_dims_vec[i];
-    }
-    auto out_grad = EigenVector<T>::Flatten(*in0);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Dims>::Eval(
-        place, x_grad, out_grad, reduce_dims, reshape_dims);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {

diff --git a/paddle/fluid/operators/expand_v2_op_xpu.cc b/paddle/fluid/operators/expand_v2_op_xpu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include "paddle/fluid/operators/expand_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {

diff --git a/paddle/pten/core/compat/op_utils.h b/paddle/pten/core/compat/op_utils.h
@@ -45,6 +45,8 @@ const std::unordered_set<std::string> deprecated_op_names({"flatten",
                                                            "mean",
                                                            "reshape",
                                                            "reshape_grad",
+                                                           "expand",
+                                                           "expand_grad",
                                                            "sum"});
 
 class DefaultKernelSignatureMap {

diff --git a/paddle/pten/kernels/cpu/expand_grad_kernel.cc b/paddle/pten/kernels/cpu/expand_grad_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/expand_grad_kernel.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/impl/expand_grad_kernel_impl.h"
+
+PT_REGISTER_KERNEL(expand_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::ExpandGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/pten/kernels/cpu/expand_kernel.cc b/paddle/pten/kernels/cpu/expand_kernel.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/expand_kernel.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/impl/expand_kernel_impl.h"
+
+PT_REGISTER_KERNEL(expand,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::ExpandKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {}