Data augmentation API name change & bug fix #2956

Merged 7 commits on Aug 9, 2022.
16 changes: 8 additions & 8 deletions docs/augmentation.md → docs/dataaug.md
@@ -12,7 +12,7 @@ PaddleNLP provides Data Augmentation APIs for augmenting training data
## Word-level data augmentation strategies

### Word substitution
- The word substitution strategy augments data by randomly replacing words in a sentence with other words. Here we show how to use `paddlenlp.data_augmentation.WordSubstitute` for word-level substitution.
+ The word substitution strategy augments data by randomly replacing words in a sentence with other words. Here we show how to use `paddlenlp.dataaug.WordSubstitute` for word-level substitution.

```text
WordSubstitute parameters:
```

@@ -49,7 +49,7 @@ WordSubstitute parameters:
The following example demonstrates word-level substitution:

``` python
- from paddlenlp.data_augmentation import WordSubstitute
+ from paddlenlp.dataaug import WordSubstitute
s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。"
s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。"
```
@@ -207,7 +207,7 @@ print("augmented:", augmented[0])
You can adjust the number of augmented sentences generated, `create_n`, and the number of words substituted per sentence, `aug_n`, to match your actual needs.
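The substitution idea itself is easy to prototype without PaddleNLP. Below is a minimal, self-contained sketch of random word substitution over a pre-tokenized sentence; the function name, the flat `vocab` list, and the `seed` parameter are illustrative stand-ins (the real `WordSubstitute` draws replacements from synonym/homonym dictionaries or a masked language model, and handles tokenization and stop words internally):

```python
import random

def word_substitute(tokens, vocab, aug_n=1, create_n=1, seed=None):
    """Return create_n copies of tokens, each with up to aug_n tokens
    replaced by a random word from vocab (a toy stand-in for a synonym table)."""
    rng = random.Random(seed)
    augmented = []
    for _ in range(create_n):
        out = list(tokens)
        # Pick aug_n distinct positions and overwrite each with a vocab word.
        for i in rng.sample(range(len(out)), min(aug_n, len(out))):
            out[i] = rng.choice(vocab)
        augmented.append(out)
    return augmented

tokens = ["人类", "语言", "是", "抽象", "的", "信息", "符号"]
for variant in word_substitute(tokens, vocab=["自然", "人工"], aug_n=1, create_n=2, seed=0):
    print("".join(variant))
```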

### Word insertion
- The word insertion strategy augments data by randomly inserting other words into a sentence. Here we show how to use `paddlenlp.data_augmentation.WordInsert` for word-level insertion.
+ The word insertion strategy augments data by randomly inserting other words into a sentence. Here we show how to use `paddlenlp.dataaug.WordInsert` for word-level insertion.

```text
WordInsert parameters:
```

@@ -238,7 +238,7 @@ WordInsert parameters:
The following example demonstrates word-level insertion:

``` python
- from paddlenlp.data_augmentation import WordInsert
+ from paddlenlp.dataaug import WordInsert
s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。"
s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。"
```
@@ -376,7 +376,7 @@ print("augmented:", augmented[0])
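As with substitution, the mechanics of word insertion are simple to sketch in plain Python. The helper below is a hypothetical stand-in, not the PaddleNLP implementation (which sources insertions from dictionaries or an MLM); it inserts `aug_n` candidate words at random positions:

```python
import random

def word_insert(tokens, candidates, aug_n=1, create_n=1, seed=None):
    """Return create_n copies of tokens, each with aug_n words from
    candidates inserted at random positions."""
    rng = random.Random(seed)
    augmented = []
    for _ in range(create_n):
        out = list(tokens)
        for _ in range(aug_n):
            # randrange(len(out) + 1) allows insertion at either end.
            out.insert(rng.randrange(len(out) + 1), rng.choice(candidates))
        augmented.append(out)
    return augmented

tokens = ["计算机", "只能", "处理", "数值化", "的", "信息"]
print("".join(word_insert(tokens, candidates=["电脑"], aug_n=1, create_n=1, seed=0)[0]))
```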

### Word deletion

- The word deletion strategy augments data by randomly deleting words from a sentence. Here we show how to use `paddlenlp.data_augmentation.WordDelete` for word-level deletion.
+ The word deletion strategy augments data by randomly deleting words from a sentence. Here we show how to use `paddlenlp.dataaug.WordDelete` for word-level deletion.

```text
WordDelete parameters:
```

@@ -400,7 +400,7 @@ WordDelete parameters:
The following example demonstrates word-level deletion:

``` python
- from paddlenlp.data_augmentation import WordDelete
+ from paddlenlp.dataaug import WordDelete
s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。"
s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。"
```
@@ -455,7 +455,7 @@ for sentence, augmented in zip(sentences, augmenteds):
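Word deletion is the simplest of the four strategies. Here is a minimal sketch with a hypothetical helper, not the PaddleNLP code (which additionally protects stop words and controls the deletion ratio):

```python
import random

def word_delete(tokens, aug_n=1, create_n=1, seed=None):
    """Return create_n copies of tokens, each with aug_n random tokens removed."""
    rng = random.Random(seed)
    augmented = []
    for _ in range(create_n):
        # Choose aug_n distinct positions to drop, then keep the rest in order.
        drop = set(rng.sample(range(len(tokens)), min(aug_n, len(tokens))))
        augmented.append([t for i, t in enumerate(tokens) if i not in drop])
    return augmented

tokens = ["人类", "可以", "很", "轻松", "地", "理解", "其中", "的", "含义"]
print("".join(word_delete(tokens, aug_n=2, create_n=1, seed=0)[0]))
```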

### Word swap

- The word swap strategy augments data by randomly swapping the positions of words in a sentence. Here we show how to use `paddlenlp.data_augmentation.WordSwap` for word-level swapping.
+ The word swap strategy augments data by randomly swapping the positions of words in a sentence. Here we show how to use `paddlenlp.dataaug.WordSwap` for word-level swapping.

```text
WordSwap parameters:
```

@@ -479,7 +479,7 @@ WordSwap parameters:
The following example demonstrates word-level swapping:

``` python
- from paddlenlp.data_augmentation import WordSwap
+ from paddlenlp.dataaug import WordSwap
s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。"
s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。"
```
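Word swap likewise reduces to exchanging token positions at random. Below is a toy sketch using a hypothetical helper (the real `WordSwap` additionally skips stop words) that swaps randomly chosen adjacent pairs; it assumes the input has at least two tokens:

```python
import random

def word_swap(tokens, aug_n=1, create_n=1, seed=None):
    """Return create_n copies of tokens, each with aug_n random
    adjacent-pair swaps applied (input must have >= 2 tokens)."""
    rng = random.Random(seed)
    augmented = []
    for _ in range(create_n):
        out = list(tokens)
        for _ in range(aug_n):
            i = rng.randrange(len(out) - 1)  # swap positions i and i+1
            out[i], out[i + 1] = out[i + 1], out[i]
        augmented.append(out)
    return augmented

tokens = ["需要", "将", "人类", "语言", "进行", "数值化", "转换"]
print(word_swap(tokens, aug_n=1, create_n=2, seed=0))
```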
2 changes: 1 addition & 1 deletion paddlenlp/__init__.py
@@ -34,7 +34,7 @@
from . import experimental
from .taskflow import Taskflow
from . import trainer
- from . import data_augmentation
+ from . import dataaug
import paddle

paddle.disable_signal_handler()
File renamed without changes.
@@ -15,6 +15,8 @@

from .base_augment import BaseAugment

+ __all__ = ['WordDelete']


class WordDelete(BaseAugment):
"""
@@ -22,6 +22,8 @@
from ..transformers import AutoModelForMaskedLM, AutoTokenizer
from .base_augment import BaseAugment

+ __all__ = ['WordInsert']


class WordInsert(BaseAugment):
"""
@@ -106,22 +108,21 @@ def _load_insert_dict(self, source_type):
        return insert_dict

    def _augment(self, sequence):
-       if self.type == 'mlm':
-           return self._augment_mlm(sequence)
        seq_tokens = self.tokenizer.cut(sequence)
        aug_indexes = self._skip_stop_word_tokens(seq_tokens)
        aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes))
        if aug_n == 0:
            return []
+       elif self.type == 'mlm':
+           return self._augment_mlm(sequence, seq_tokens, aug_indexes)
        elif aug_n == 1:
            return self._augment_single(seq_tokens, aug_indexes)
        else:
            return self._augment_multi(seq_tokens, aug_n, aug_indexes)

    @paddle.no_grad()
-   def _augment_mlm(self, sequence):
-       seq_tokens = self.tokenizer.cut(sequence)
-       aug_indexes = self._skip_stop_word_tokens(seq_tokens)
+   def _augment_mlm(self, sequence, seq_tokens, aug_indexes):

        t = 0
        sentences = []
        while t < self.create_n * self.loop and len(sentences) < self.create_n:
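The reordering above is the bug fix: previously `_augment` dispatched to the MLM path before computing `aug_n`, so a sentence with nothing to augment (e.g. all stop words) was not short-circuited, and `_augment_mlm` re-ran the tokenizer and stop-word filtering. A simplified sketch of the fixed control flow, with hypothetical stand-in callables in place of the real tokenizer and augmenters:

```python
def augment_dispatch(sequence, cut, skip_stop_words, get_aug_n, aug_type,
                     augment_mlm, augment_single, augment_multi):
    """Mirror the fixed _augment ordering: compute candidates first,
    exit early when aug_n == 0, then dispatch on the strategy type."""
    seq_tokens = cut(sequence)
    aug_indexes = skip_stop_words(seq_tokens)
    aug_n = get_aug_n(len(seq_tokens), len(aug_indexes))
    if aug_n == 0:
        return []  # nothing augmentable: bail out before any model call
    if aug_type == 'mlm':
        # tokens/indexes are passed down instead of being recomputed
        return augment_mlm(sequence, seq_tokens, aug_indexes)
    if aug_n == 1:
        return augment_single(seq_tokens, aug_indexes)
    return augment_multi(seq_tokens, aug_n, aug_indexes)
```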
@@ -24,6 +24,8 @@
from ..transformers import AutoModelForMaskedLM, AutoTokenizer
from .base_augment import BaseAugment

+ __all__ = ['WordSubstitute']


class WordSubstitute(BaseAugment):
"""
@@ -15,6 +15,8 @@

from .base_augment import BaseAugment

+ __all__ = ['WordSwap']


class WordSwap(BaseAugment):
"""