PaddlePaddle · wanghuancoder · Nov 17, 2023 · Nov 11, 2023 · Nov 14, 2023 · Nov 14, 2023
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
@@ -235,7 +235,8 @@ void embedding_grad_impl(const Tensor& x,
   VLOG(6) << "embedding_grad API kernel key: [" << kernel_key.backend() << ", "
           << kernel_key.layout() << ", " << kernel_data_type << "]";
 
-  if (phi::DenseTensor::classof(weight.impl().get())) {
+  if (phi::DenseTensor::classof(weight.impl().get()) ||
+      phi::distributed::DistTensor::classof(weight.impl().get())) {
     std::string kernel_name =
         sparse ? "embedding_sparse_grad" : "embedding_grad";
     auto kernel_result =
@@ -248,6 +249,108 @@ void embedding_grad_impl(const Tensor& x,
     auto* dev_ctx = GetDeviceContextByBackend(
         kernel_result.has_fallback_cpu ? Backend::CPU : kernel_key.backend());
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+    bool run_auto_parallel = AllInputsAreDistTensor(x, weight, out_grad);
+    // Auto Parallel condition
+    if (run_auto_parallel) {
+      bool rank_is_in_current_mesh = true;
+      auto mesh =
+          std::static_pointer_cast<phi::distributed::DistTensor>(x.impl())
+              ->dist_attr()
+              .process_mesh();
+      rank_is_in_current_mesh = phi::distributed::IsCurRankInMesh(mesh);
+
+      // 1. InferSpmd (Infer DistAttr of Inputs&Outputs)
+      auto meta_dist_input_x = MakeDistMetaTensor(*x.impl());
+      auto meta_dist_input_weight = MakeDistMetaTensor(*weight.impl());
+      auto meta_dist_input_out_grad = MakeDistMetaTensor(*out_grad.impl());
+      auto spmd_info = phi::distributed::VariadicReplicatedInferSpmdDynamic(
+          meta_dist_input_weight, meta_dist_input_x, meta_dist_input_out_grad);
+
+      // 2. Create Temporary Output & Prepare Dist and Dense Output
+      std::shared_ptr<phi::distributed::DistTensor> shared_dist_out =
+          CreateKernelDistOutput(weight_grad, !rank_is_in_current_mesh);
+      phi::distributed::DistTensor* dist_out = shared_dist_out.get();
+      phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value();
+      if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {
+        *dense_out = phi::DenseTensor(
+            std::make_shared<phi::Allocation>(
+                nullptr, 0, phi::distributed::GetDefaultPlace()),
+            phi::DenseTensorMeta());
+      }
+
+      // 3. Infer DistTensor's Global Shape
+      phi::MetaTensor meta_dist_out(dist_out);
+      UnchangedInferMeta(MakeMetaTensor(*weight.impl()), &meta_dist_out);
+
+      // 4. Set Output Dist Attr For Default Impl
+      auto current_process_mesh =
+          paddle::holds_alternative<phi::distributed::TensorDistAttr>(
+              spmd_info.first[0])
+              ? paddle::get<0>(spmd_info.first[0]).process_mesh()
+              : paddle::get<1>(spmd_info.first[0]).at(0).process_mesh();
+      SetReplicatedDistAttrForOutput(dist_out, current_process_mesh);
+
+      if (rank_is_in_current_mesh) {
+        // 5. Reshard Input
+        auto dist_input_weight =
+            ReshardApiInputToKernelInput(dev_ctx, weight, spmd_info.first[0]);
+        auto dist_input_x =
+            ReshardApiInputToKernelInput(dev_ctx, x, spmd_info.first[1]);
+        auto dist_input_out_grad =
+            ReshardApiInputToKernelInput(dev_ctx, out_grad, spmd_info.first[2]);
+
+        // 6. PrepareData (DataTransform & Prepare Dense Input)
+        dist_input_weight = PrepareDataForDistTensor(
+            dist_input_weight,
+            GetKernelInputArgDef(kernel.InputAt(0), kernel_key.backend()),
+            {},
+            kernel_result.is_stride_kernel);
+        auto input_weight = &dist_input_weight->value();
+
+        dist_input_x = PrepareDataForDistTensor(
+            dist_input_x,
+            GetKernelInputArgDef(kernel.InputAt(1), kernel_key.backend()),
+            {},
+            kernel_result.is_stride_kernel);
+        auto input_x = &dist_input_x->value();
+
+        dist_input_out_grad = PrepareDataForDistTensor(
+            dist_input_out_grad,
+            GetKernelInputArgDef(kernel.InputAt(2), kernel_key.backend()),
+            {},
+            kernel_result.is_stride_kernel);
+        auto input_out_grad = &dist_input_out_grad->value();
+
+        // 7. Infer Local DenseTensor Meta
+        phi::MetaTensor meta_dense_out(dense_out);
+        phi::EmbeddingGradInferMeta(MakeMetaTensor(*input_x),
+                                    MakeMetaTensor(*input_weight),
+                                    &meta_dense_out);
+
+        // 8. DenseTensor Kernel Call
+        using kernel_signature = void (*)(const phi::DeviceContext&,
+                                          const phi::DenseTensor&,
+                                          const phi::DenseTensor&,
+                                          const phi::DenseTensor&,
+                                          int64_t,
+                                          phi::DenseTensor*);
+        auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+        (*kernel_fn)(*dev_ctx,
+                     *input_x,
+                     *input_weight,
+                     *input_out_grad,
+                     padding_idx,
+                     dense_out);
+      }
+      // 9. Reshard Kernel Output to API output
+      ReshardKernelOutputToApiOutput(dev_ctx, shared_dist_out, weight_grad);
+
+      // 10. Return
+      return;
+    }
+#endif  // PADDLE_WITH_DISTRIBUTE
+
     auto input_x = PrepareData(x, kernel.InputAt(0), {}, false);
     auto input_weight = PrepareData(weight, kernel.InputAt(1), {}, false);
     auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {}, false);

diff --git a/test/auto_parallel/semi_auto_parallel_for_embedding_grad.py b/test/auto_parallel/semi_auto_parallel_for_embedding_grad.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+
+
+class TestCustomEmbeddingGradApiForSemiAutoParallel:
+    """Compare embedding forward/backward on dense vs. sharded tensors.
+
+    Reads "dtype", "backend" and "seed" from the environment and uses a
+    process mesh over ranks 0 and 1 whose single dimension is named "x".
+    """
+
+    # Every case below uses the same index and weight shapes.
+    _X_SHAPE = [12, 16]
+    _W_SHAPE = [10, 4]
+
+    def __init__(self):
+        """Load dtype/backend/seed from env vars and build the mesh."""
+        self._dtype = os.getenv("dtype")
+        self._backend = os.getenv("backend")
+        # int() rather than eval(): never execute text taken from the env.
+        self._seed = int(os.getenv("seed"))
+        self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+    def check_tensor_eq(self, a, b):
+        """Assert a.numpy() and b.numpy() agree within rtol=1e-05."""
+        np.testing.assert_allclose(
+            a.numpy(), b.numpy(), rtol=1e-05, verbose=True
+        )
+
+    def test_body(self, x_shape, w_shape, x_specs, w_specs):
+        """Check one sharding layout against the dense reference.
+
+        Builds random integer indices of shape x_shape and a random weight
+        of shape w_shape, runs paddle.nn.functional.embedding on plain
+        tensors and on tensors sharded with x_specs / w_specs, and asserts
+        that both the outputs and the weight gradients match.
+
+        Returns the sharded output and the sharded weight gradient.
+        """
+        paddle.seed(self._seed)
+        np.random.seed(self._seed)
+        # Indices must stay below the number of weight rows.
+        x_np = np.random.randint(0, w_shape[0], size=x_shape)
+        w_np = np.random.random(size=w_shape).astype(self._dtype)
+
+        x = paddle.to_tensor(x_np)
+        w = paddle.to_tensor(w_np)
+        x.stop_gradient = False
+        w.stop_gradient = False
+
+        x_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=x_specs)
+        w_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=w_specs)
+
+        dist_x = dist.shard_tensor(x_np, dist_attr=x_dist_attr)
+        dist_w = dist.shard_tensor(w_np, dist_attr=w_dist_attr)
+        dist_x.stop_gradient = False
+        dist_w.stop_gradient = False
+
+        out = paddle.nn.functional.embedding(x, weight=w)
+        dist_out = paddle.nn.functional.embedding(dist_x, weight=dist_w)
+        self.check_tensor_eq(out, dist_out)
+
+        out.backward()
+        dist_out.backward()
+        self.check_tensor_eq(w.grad, dist_w.grad)
+
+        return dist_out, dist_w.grad
+
+    def _run_case(self, x_specs, w_specs):
+        """Run test_body on the shared shapes with the given sharding specs."""
+        return self.test_body(
+            x_shape=self._X_SHAPE,
+            w_shape=self._W_SHAPE,
+            x_specs=x_specs,
+            w_specs=w_specs,
+        )
+
+    # Sharding cases. Each spec list is passed to dist.DistAttr as
+    # sharding_specs with one entry per tensor dimension: "x" names the
+    # dimension of self._mesh, None means no mesh dimension is named there.
+    def test_non_shard(self):
+        self._run_case(x_specs=[None, None], w_specs=[None, None])
+
+    def test_x_row_shard(self):
+        self._run_case(x_specs=["x", None], w_specs=[None, None])
+
+    def test_x_col_shard(self):
+        self._run_case(x_specs=[None, "x"], w_specs=[None, None])
+
+    def test_w_row_shard(self):
+        self._run_case(x_specs=[None, None], w_specs=["x", None])
+
+    def test_w_col_shard(self):
+        self._run_case(x_specs=[None, None], w_specs=[None, "x"])
+
+    def test_x_row_w_col_shard(self):
+        self._run_case(x_specs=["x", None], w_specs=[None, "x"])
+
+    def test_x_col_w_row_shard(self):
+        self._run_case(x_specs=[None, "x"], w_specs=["x", None])
+
+    def test_both_col_shard(self):
+        self._run_case(x_specs=[None, "x"], w_specs=[None, "x"])
+
+    def run_test_case(self):
+        """Select the device for the configured backend, then run every case.
+
+        Raises ValueError when the backend is neither "cpu" nor "gpu".
+        """
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+        elif self._backend == "gpu":
+            # Each rank uses the GPU whose index equals its rank.
+            paddle.set_device("gpu:" + str(dist.get_rank()))
+        else:
+            raise ValueError("Only support cpu or gpu backend.")
+
+        self.test_non_shard()
+        self.test_x_row_shard()
+        self.test_x_col_shard()
+        self.test_w_row_shard()
+        self.test_w_col_shard()
+        self.test_x_row_w_col_shard()
+        self.test_x_col_w_row_shard()
+        self.test_both_col_shard()
+
+
+if __name__ == '__main__':
+    TestCustomEmbeddingGradApiForSemiAutoParallel().run_test_case()
diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py
@@ -106,6 +106,16 @@ def test_custom_relu_api(self):
                 user_defined_envs=envs,
             )
 
+    def test_custom_embedding_grad_api(self):
+        """Run the embedding-grad script under every generated env dict."""
+        script = "semi_auto_parallel_for_embedding_grad.py"
+        env_combinations = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        # Run the script once for each env dict returned above.
+        for env in env_combinations:
+            self.run_test_case(script, user_defined_envs=env)
+
 
 if __name__ == "__main__":
     unittest.main()