Skip to content

Commit

Permalink
part-3 cherry-pick from: add check for c_embedding (PaddlePaddle#55621)
Browse files Browse the repository at this point in the history
  • Loading branch information
ForFishes authored and wentaoyu committed Nov 23, 2023
1 parent ded4c9d commit f457d4e
Show file tree
Hide file tree
Showing 10 changed files with 35 additions and 18 deletions.
4 changes: 4 additions & 0 deletions paddle/fluid/operators/collective/c_embedding_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker {
"(int64, default 0), The starting index is indeed, "
"and the out-of-bounds will be set to 0 ")
.SetDefault(0);
AddAttr<int64_t>("vocab_size",
"(int64, default -1), The total vocabulary size to check"
"the out-of-bounds ids. If it is -1, no check will be ")
.SetDefault(-1);
AddComment(R"DOC(
c_embedding Operator.
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/api/yaml/legacy_backward.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
backward : batch_norm_double_grad

- backward_op : c_embedding_grad
forward : c_embedding (Tensor weight, Tensor x, int64_t start_index=0) -> Tensor(out)
forward : c_embedding (Tensor weight, Tensor x, int64_t start_index=0, int64_t vocab_size=-1) -> Tensor(out)
args : (Tensor weight, Tensor x, Tensor out_grad, int64_t start_index=0)
output : Tensor(weight_grad)
infer_meta :
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/api/yaml/legacy_ops.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,14 @@
func : c_concat

- op : c_embedding
args : (Tensor weight, Tensor x, int64_t start_index=0)
args : (Tensor weight, Tensor x, int64_t start_index=0, int64_t vocab_size=-1)
output : Tensor(out)
infer_meta :
func : CEmbeddingInferMeta
param : [weight, x, start_index]
kernel :
func : c_embedding
param : [weight, x, start_index]
param : [weight, x, start_index, vocab_size]
data_type : weight
backward : c_embedding_grad

Expand Down
1 change: 1 addition & 0 deletions paddle/phi/kernels/c_embedding_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ void CEmbeddingKernel(const Context& ctx,
const DenseTensor& w,
const DenseTensor& ids,
int64_t start_index,
int64_t vocab_size,
DenseTensor* out);

} // namespace phi
1 change: 1 addition & 0 deletions paddle/phi/kernels/cpu/c_embedding_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ void CEmbeddingKernel(const Context& ctx,
const DenseTensor& w,
const DenseTensor& ids,
int64_t start_index,
int64_t vocab_size,
DenseTensor* out) {
VLOG(10) << "table_dims:" << w.dims();
const T* table_data = w.data<T>();
Expand Down
19 changes: 12 additions & 7 deletions paddle/phi/kernels/gpu/c_embedding_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,23 @@ __global__ void CEmbedding(T* out,
const int64_t N,
const int64_t start_idx,
const int64_t end_idx,
const int64_t vocab_size,
const int64_t limit) {
CUDA_KERNEL_LOOP(i, limit) {
size_t row = i / columns;
size_t col = i % columns;
auto id = ids[row];

PADDLE_ENFORCE(
id >= 0 && (vocab_size < 0 || id < vocab_size),
"The index is out of bounds, "
"please check whether the dimensions of index and "
"input meet the requirements. It should "
"be less than [%d] and greater than or equal to 0, but received [%d]",
vocab_size,
id);
if (id >= start_idx && id < end_idx) {
auto real_idx = id - start_idx;
PADDLE_ENFORCE(real_idx < N,
"The index is out of bounds, "
"please check whether the dimensions of index and "
"input meet the requirements. It should "
"be less than [%d], but received [%d]",
N,
real_idx);
out[i] = table[real_idx * columns + col];
} else {
out[i] = static_cast<T>(0);
Expand All @@ -63,6 +65,7 @@ void CEmbeddingKernel(const Context& ctx,
const DenseTensor& w,
const DenseTensor& ids,
int64_t start_index,
int64_t vocab_size,
DenseTensor* out) {
size_t N = w.dims()[0];
size_t D = w.dims()[1];
Expand All @@ -87,6 +90,7 @@ void CEmbeddingKernel(const Context& ctx,
D,
N,
start_index,
vocab_size,
end_idx,
limit);

Expand All @@ -99,6 +103,7 @@ void CEmbeddingKernel(const Context& ctx,
D,
N,
start_index,
vocab_size,
end_idx,
limit);
} else {
Expand Down
1 change: 1 addition & 0 deletions paddle/phi/kernels/xpu/c_embedding_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ void CEmbeddingKernel(const Context& dev_ctx,
const DenseTensor& w,
const DenseTensor& ids,
int64_t start_index,
int64_t vocab_size,
DenseTensor* out) {
const T* table_data = w.data<T>();
T* output_data = dev_ctx.template Alloc<T>(out);
Expand Down
3 changes: 2 additions & 1 deletion paddle/phi/ops/compat/c_embedding_sig.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

namespace phi {
// Maps the fluid `c_embedding` op onto the phi kernel signature.
// KernelSignature takes exactly four argument groups:
//   (kernel_name, inputs, attrs, outputs).
// The newly added "vocab_size" attribute therefore belongs in the SAME
// attrs list as "start_index" — passing it as a separate brace-list would
// shift it into the outputs slot and leave {"Out"} as a stray argument.
KernelSignature CEmbeddingOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature(
      "c_embedding", {"W", "Ids"}, {"start_index", "vocab_size"}, {"Out"});
}

KernelSignature CEmbeddingGradOpArgumentMapping(
Expand Down
2 changes: 2 additions & 0 deletions python/paddle/distributed/fleet/layers/mpu/mp_layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ def __init__(
self._size = [per_part_size, embedding_dim]
self._weight_attr = weight_attr
self._name = name
self.num_embeddings = num_embeddings

if self.is_mp and paddle.in_dynamic_mode():
with get_rng_state_tracker().rng_state():
Expand Down Expand Up @@ -161,6 +162,7 @@ def forward(self, x):
self.weight,
x,
start_index=self.vocab_start_index,
vocab_size=self.num_embeddings,
name=self._name,
)
output = mp_ops._mp_allreduce(
Expand Down
16 changes: 9 additions & 7 deletions python/paddle/distributed/fleet/layers/mpu/mp_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.

import paddle
from paddle import _legacy_C_ops
from paddle import _C_ops, _legacy_C_ops
from paddle.autograd import PyLayer
from paddle.base.data_feeder import check_dtype, check_variable_and_dtype
from paddle.distributed import collective
Expand Down Expand Up @@ -330,7 +330,7 @@ def _mp_allreduce(
return out


def _c_lookup_table(table, index, start_index=0, name=None):
def _c_lookup_table(table, index, start_index=0, vocab_size=-1, name=None):
"""
Lookup table according to index.
Expand All @@ -345,9 +345,7 @@ def _c_lookup_table(table, index, start_index=0, name=None):
Tensor.
"""
if in_dynamic_mode():
return _legacy_C_ops.c_embedding(
table, index, "start_index", start_index
)
return _C_ops.c_embedding(table, index, start_index, vocab_size)
else:
op_type = 'c_embedding'
helper = LayerHelper(op_type, **locals())
Expand All @@ -358,7 +356,7 @@ def _c_lookup_table(table, index, start_index=0, name=None):
type='c_embedding',
inputs={'Ids': index, 'W': table},
outputs={'Out': tmp},
attrs={"start_index": start_index},
attrs={"start_index": start_index, "vocab_size": vocab_size},
)
return tmp

Expand Down Expand Up @@ -684,7 +682,11 @@ def _parallel_embedding(
main_block.vars[weight.name].is_distributed = True

output_parallel = _c_lookup_table(
weight, x, start_index=vocab_start_index, name=name
weight,
x,
start_index=vocab_start_index,
vocab_size=origin_size[0],
name=name,
)
out = _mp_allreduce(
output_parallel,
Expand Down

0 comments on commit f457d4e

Please sign in to comment.