From 8e08906bff8fdbb89daddc5c73a48bdaab1c55d1 Mon Sep 17 00:00:00 2001 From: lugimzzz Date: Thu, 4 Aug 2022 02:03:59 +0000 Subject: [PATCH 1/5] augmentation_bug_fix --- paddlenlp/data_augmentation/word_insert.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddlenlp/data_augmentation/word_insert.py b/paddlenlp/data_augmentation/word_insert.py index fcce741acd88..fb320e431867 100644 --- a/paddlenlp/data_augmentation/word_insert.py +++ b/paddlenlp/data_augmentation/word_insert.py @@ -106,22 +106,21 @@ def _load_insert_dict(self, source_type): return insert_dict def _augment(self, sequence): - if self.type == 'mlm': - return self._augment_mlm(sequence) seq_tokens = self.tokenizer.cut(sequence) aug_indexes = self._skip_stop_word_tokens(seq_tokens) aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) if aug_n == 0: return [] + elif self.type == 'mlm': + return self._augment_mlm(sequence, seq_tokens, aug_indexes) elif aug_n == 1: return self._augment_single(seq_tokens, aug_indexes) else: return self._augment_multi(seq_tokens, aug_n, aug_indexes) @paddle.no_grad() - def _augment_mlm(self, sequence): - seq_tokens = self.tokenizer.cut(sequence) - aug_indexes = self._skip_stop_word_tokens(seq_tokens) + def _augment_mlm(self, sequence, seq_tokens, aug_indexes): + t = 0 sentences = [] while t < self.create_n * self.loop and len(sentences) < self.create_n: From 8b9e881f1ea5e11dd183ed0ef2222d10e87a3758 Mon Sep 17 00:00:00 2001 From: lugimzzz Date: Fri, 5 Aug 2022 02:34:12 +0000 Subject: [PATCH 2/5] change_data_aug_name --- paddlenlp/{data_augmentation => dataaug}/__init__.py | 0 paddlenlp/{data_augmentation => dataaug}/base_augment.py | 0 paddlenlp/{data_augmentation => dataaug}/word_delete.py | 0 paddlenlp/{data_augmentation => dataaug}/word_insert.py | 0 paddlenlp/{data_augmentation => dataaug}/word_substitute.py | 0 paddlenlp/{data_augmentation => dataaug}/word_swap.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename paddlenlp/{data_augmentation => dataaug}/__init__.py (100%) rename paddlenlp/{data_augmentation => dataaug}/base_augment.py (100%) rename paddlenlp/{data_augmentation => dataaug}/word_delete.py (100%) rename paddlenlp/{data_augmentation => dataaug}/word_insert.py (100%) rename paddlenlp/{data_augmentation => dataaug}/word_substitute.py (100%) rename paddlenlp/{data_augmentation => dataaug}/word_swap.py (100%) diff --git a/paddlenlp/data_augmentation/__init__.py b/paddlenlp/dataaug/__init__.py similarity index 100% rename from paddlenlp/data_augmentation/__init__.py rename to paddlenlp/dataaug/__init__.py diff --git a/paddlenlp/data_augmentation/base_augment.py b/paddlenlp/dataaug/base_augment.py similarity index 100% rename from paddlenlp/data_augmentation/base_augment.py rename to paddlenlp/dataaug/base_augment.py diff --git a/paddlenlp/data_augmentation/word_delete.py b/paddlenlp/dataaug/word_delete.py similarity index 100% rename from paddlenlp/data_augmentation/word_delete.py rename to paddlenlp/dataaug/word_delete.py diff --git a/paddlenlp/data_augmentation/word_insert.py b/paddlenlp/dataaug/word_insert.py similarity index 100% rename from paddlenlp/data_augmentation/word_insert.py rename to paddlenlp/dataaug/word_insert.py diff --git a/paddlenlp/data_augmentation/word_substitute.py b/paddlenlp/dataaug/word_substitute.py similarity index 100% rename from paddlenlp/data_augmentation/word_substitute.py rename to paddlenlp/dataaug/word_substitute.py diff --git a/paddlenlp/data_augmentation/word_swap.py b/paddlenlp/dataaug/word_swap.py similarity index 100% rename from paddlenlp/data_augmentation/word_swap.py rename to paddlenlp/dataaug/word_swap.py From 26db214091bcd179527eafd5c48bda6b40695f66 Mon Sep 17 00:00:00 2001 From: lugimzzz Date: Fri, 5 Aug 2022 02:54:15 +0000 Subject: [PATCH 3/5] change_data_aug_name --- docs/augmentation.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/augmentation.md b/docs/augmentation.md index 8b484d212d62..b10922754bcd 100644 --- a/docs/augmentation.md +++ b/docs/augmentation.md @@ -12,7 +12,7 @@ PaddleNLP提供了Data Augmentation数据增强API,可用于训练数据数据 ## 词级别数据增强策略 ### 词替换 -词替换数据增强策略也即将句子中的词随机替换为其他单词进行数据增强,这里我们将介绍如何使用`paddlenlp.data_augmentation.WordSubstitute`进行词级别替换的数据增强。 +词替换数据增强策略也即将句子中的词随机替换为其他单词进行数据增强,这里我们将介绍如何使用`paddlenlp.dataaug.WordSubstitute`进行词级别替换的数据增强。 ```text WordSubstitute 参数介绍: @@ -49,7 +49,7 @@ WordSubstitute 参数介绍: 我们接下来将以下面的例子介绍词级别替换的使用: ``` python -from paddlenlp.data_augmentation import WordSubstitute +from paddlenlp.dataaug import WordSubstitute s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。" s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。" ``` @@ -207,7 +207,7 @@ print("augmented:", augmented[0]) 可以根据的实际需求,修改数据增强生成句子数 `create_n`和句子中被替换的词数量 `aug_n`。 ### 词插入 -词插入数据增强策略也即将句子中的词随机插入其他单词进行数据增强,这里我们将介绍如何使用`paddlenlp.data_augmentation.WordInsert`进行词级别插入的数据增强。 +词插入数据增强策略也即将句子中的词随机插入其他单词进行数据增强,这里我们将介绍如何使用`paddlenlp.dataaug.WordInsert`进行词级别插入的数据增强。 ```text WordInsert 参数介绍: @@ -238,7 +238,7 @@ WordInsert 参数介绍: 我们接下来将以下面的例子介绍词级别插入的使用: ``` python -from paddlenlp.data_augmentation import WordInsert +from paddlenlp.dataaug import WordInsert s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。" s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。" ``` @@ -376,7 +376,7 @@ print("augmented:", augmented[0]) ### 词删除 -词删除数据增强策略也即将句子中的词随机删除进行数据增强,这里我们将介绍如何使用`paddlenlp.data_augmentation.WordDelete`进行词级别删除的数据增强。 +词删除数据增强策略也即将句子中的词随机删除进行数据增强,这里我们将介绍如何使用`paddlenlp.dataaug.WordDelete`进行词级别删除的数据增强。 ```text WordDelete 参数介绍: @@ -400,7 +400,7 @@ WordDelete 参数介绍: 我们接下来将以下面的例子介绍词级别删除的使用: ``` python -from paddlenlp.data_augmentation import WordDelete +from paddlenlp.dataaug import WordDelete s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。" s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。" ``` @@ -455,7 +455,7 @@ for sentence, augmented in zip(sentences, augmenteds): ### 词交换 -词交换数据增强策略也即将句子中的词的位置随机交换进行数据增强,这里我们将介绍如何使用`paddlenlp.data_augmentation.WordSwap`进行词级别交换的数据增强。 +词交换数据增强策略也即将句子中的词的位置随机交换进行数据增强,这里我们将介绍如何使用`paddlenlp.dataaug.WordSwap`进行词级别交换的数据增强。 ```text WordSwap 参数介绍: @@ -479,7 +479,7 @@ WordSwap 参数介绍: 我们接下来将以下面的例子介绍词级别交换的使用: ``` python -from paddlenlp.data_augmentation import WordSwap +from paddlenlp.dataaug import WordSwap s1 = "人类语言是抽象的信息符号,其中蕴含着丰富的语义信息,人类可以很轻松地理解其中的含义。" s2 = "而计算机只能处理数值化的信息,无法直接理解人类语言,所以需要将人类语言进行数值化转换。" ``` From 781b871726ab92c12683da61ef1352b7058c380f Mon Sep 17 00:00:00 2001 From: lugimzzz Date: Fri, 5 Aug 2022 03:06:54 +0000 Subject: [PATCH 4/5] change_data_aug_name --- docs/{augmentation.md => dataaug.md} | 0 paddlenlp/__init__.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/{augmentation.md => dataaug.md} (100%) diff --git a/docs/augmentation.md b/docs/dataaug.md similarity index 100% rename from docs/augmentation.md rename to docs/dataaug.md diff --git a/paddlenlp/__init__.py b/paddlenlp/__init__.py index 76a8c9c6f60e..8712f96b7f22 100644 --- a/paddlenlp/__init__.py +++ b/paddlenlp/__init__.py @@ -34,7 +34,7 @@ from . import experimental from .taskflow import Taskflow from . import trainer -from . import data_augmentation +from . import dataaug import paddle paddle.disable_signal_handler() From 63b034e545a614dab5335904eda8bd1ee54fd60e Mon Sep 17 00:00:00 2001 From: lugimzzz Date: Mon, 8 Aug 2022 04:02:50 +0000 Subject: [PATCH 5/5] change_data_aug_name --- paddlenlp/dataaug/word_delete.py | 2 ++ paddlenlp/dataaug/word_insert.py | 2 ++ paddlenlp/dataaug/word_substitute.py | 2 ++ paddlenlp/dataaug/word_swap.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/paddlenlp/dataaug/word_delete.py b/paddlenlp/dataaug/word_delete.py index fb35121e250a..a0fb79864bcf 100644 --- a/paddlenlp/dataaug/word_delete.py +++ b/paddlenlp/dataaug/word_delete.py @@ -15,6 +15,8 @@ from .base_augment import BaseAugment +__all__ = ['WordDelete'] + class WordDelete(BaseAugment): """ diff --git a/paddlenlp/dataaug/word_insert.py b/paddlenlp/dataaug/word_insert.py index fb320e431867..b60a82950329 100644 --- a/paddlenlp/dataaug/word_insert.py +++ b/paddlenlp/dataaug/word_insert.py @@ -22,6 +22,8 @@ from ..transformers import AutoModelForMaskedLM, AutoTokenizer from .base_augment import BaseAugment +__all__ = ['WordInsert'] + class WordInsert(BaseAugment): """ diff --git a/paddlenlp/dataaug/word_substitute.py b/paddlenlp/dataaug/word_substitute.py index 66dbc45566a5..b5a907149d73 100644 --- a/paddlenlp/dataaug/word_substitute.py +++ b/paddlenlp/dataaug/word_substitute.py @@ -24,6 +24,8 @@ from ..transformers import AutoModelForMaskedLM, AutoTokenizer from .base_augment import BaseAugment +__all__ = ['WordSubstitute'] + class WordSubstitute(BaseAugment): """ diff --git a/paddlenlp/dataaug/word_swap.py b/paddlenlp/dataaug/word_swap.py index 4a7b9319c5b1..2812b88d9b55 100644 --- a/paddlenlp/dataaug/word_swap.py +++ b/paddlenlp/dataaug/word_swap.py @@ -15,6 +15,8 @@ from .base_augment import BaseAugment +__all__ = ['WordSwap'] + class WordSwap(BaseAugment): """