text classification bug fix & support ernie m #3184

Merged · 3 commits · Sep 5, 2022
16 changes: 12 additions & 4 deletions applications/text_classification/hierarchical/README.md
@@ -47,6 +47,7 @@
wget https://paddlenlp.bj.bcebos.com/datasets/baidu_extract_2020.tar.gz
tar -zxvf baidu_extract_2020.tar.gz
mv baidu_extract_2020 data
rm baidu_extract_2020.tar.gz
```

<div align="center">
@@ -194,6 +195,7 @@ data/
Training runs on CPU or GPU and defaults to GPU. To train on CPU, just change the device argument to `--device "cpu"`:
```shell
python train.py \
--dataset_dir "data" \
--device "gpu" \
--max_seq_length 128 \
--model_name "ernie-3.0-medium-zh" \
@@ -205,6 +207,7 @@ python train.py \
When training in a CPU environment, you can set the `nproc_per_node` argument for multi-core training:
```shell
python -m paddle.distributed.launch --nproc_per_node 8 --backend "gloo" train.py \
--dataset_dir "data" \
--device "gpu" \
--max_seq_length 128 \
--model_name "ernie-3.0-medium-zh" \
@@ -217,6 +220,7 @@ python -m paddle.distributed.launch --nproc_per_node 8 --backend "gloo" train.py
```shell
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" train.py \
--dataset_dir "data" \
--device "gpu" \
--max_seq_length 128 \
--model_name "ernie-3.0-medium-zh" \
@@ -260,13 +264,13 @@ checkpoint/
**NOTE:**
* To resume training from a checkpoint, set `--init_from_ckpt checkpoint/model_state.pdparams`.
* To train an English text classification task, just switch the pretrained model via the `model_name` argument. "ernie-2.0-base-en" is recommended for English tasks; see [Transformer pretrained models](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/index.html#transformer) for more options.

* For text classification in languages other than English and Chinese, the multilingual pretrained models "ernie-m-base" and "ernie-m-large" are recommended; for example, pass `--model_name "ernie-m-base"` to any of the train.py commands above. Multilingual models do not yet support text classification deployment; that functionality is under active development.
#### 2.4.2 Training Evaluation and Model Optimization

The trained model can be evaluated per class with the [model analysis module](./analysis), which also outputs mispredicted samples (bad cases). It runs on GPU by default; in a CPU environment, change the argument to `--device "cpu"`:

```shell
python analysis/evaluate.py --device "gpu" --max_seq_length 128 --batch_size 32 --bad_case_path "./bad_case.txt"
python analysis/evaluate.py --device "gpu" --max_seq_length 128 --batch_size 32 --bad_case_path "./bad_case.txt" --dataset_dir "data" --params_path "./checkpoint"
```

Example output:
@@ -307,7 +311,7 @@ Prediction Label Text
After training finishes, feed the data to predict (data.txt) and the label list (label.txt) to the trained model to run prediction. It runs on GPU by default; in a CPU environment, change the argument to `--device "cpu"`:

```shell
python predict.py --device "gpu" --max_seq_length 128 --batch_size 32
python predict.py --device "gpu" --max_seq_length 128 --batch_size 32 --dataset_dir "data"
```

Configurable parameters:
@@ -361,10 +365,14 @@ pip install paddleslim==2.2.2
```shell
python prune.py \
--device "gpu" \
--dataset_dir "data" \
--output_dir "prune" \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--num_train_epochs 10 \
--max_seq_length 128 \
--logging_steps 5 \
--save_steps 100 \
--width_mult_list '3/4' '2/3' '1/2'
```

@@ -376,7 +384,7 @@ python prune.py \
* `per_device_eval_batch_size`: Batch size for evaluation on the dev set. Adjust it to your GPU memory and lower it if you run out of memory; defaults to 32.
* `learning_rate`: Maximum learning rate for training; defaults to 3e-5.
* `num_train_epochs`: Number of training epochs; 100 is a reasonable choice when using early stopping. Defaults to 10.
* `logging_steps`: Interval, in steps, between log prints during training; defaults to 5.
* `logging_steps`: Interval, in steps, between log prints during training; defaults to 100.
Collaborator

Isn't this logging_steps value too large for most CPU users?

Contributor Author

That's because the Trainer's default for this parameter is 100, but when training I still set --logging_steps 5 on the command line.

* `save_steps`: Interval, in steps, between model checkpoint saves during training; defaults to 100.
* `seed`: Random seed; defaults to 3.
* `width_mult_list`: List of retention ratios for the pruned width (the number of attention heads), i.e., the fraction of the `q`, `k`, `v` and `ffn` weight width in self-attention that is kept; each ratio multiplied by the width (the number of heads) should be an integer (see the sanity check after this list). Defaults to None.
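As a quick sanity check of that constraint, here is a minimal sketch; it assumes ernie-3.0-medium-zh uses 12 attention heads (the head count is not stated in this diff):

```python
# Sketch: each width_mult ratio times the head count should be an integer.
from fractions import Fraction

num_heads = 12  # assumed head count for ernie-3.0-medium-zh
for ratio in ("3/4", "2/3", "1/2"):
    kept = Fraction(ratio) * num_heads
    assert kept.denominator == 1, f"{ratio} would keep a fractional number of heads"
    print(f"width_mult {ratio}: keeps {int(kept)} of {num_heads} heads")
```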
@@ -142,9 +142,8 @@ def evaluate():
probs = []
labels = []
for batch in train_data_loader:
input_ids, token_type_ids, label = batch['input_ids'], batch[
'token_type_ids'], batch['labels']
logits = model(input_ids, token_type_ids)
label = batch.pop("labels")
logits = model(**batch)
Collaborator

What is the purpose of not unpacking the batch here?

Contributor Author

This is to support the ERNIE-M model: the data produced by the ERNIE-M tokenizer and the model's inputs do not include token_type_ids, so the whole batch dict is passed through. A sketch of the pattern follows.
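A minimal sketch of the pattern (not part of this PR's diff; it assumes both model names resolve through PaddleNLP's Auto classes and that the pretrained weights can be downloaded): the ERNIE-M tokenizer emits only `input_ids`, so unpacking the batch dict with `**` keeps one forward call that works for both model families.

```python
# Sketch: one forward call that works with and without token_type_ids.
import paddle
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

for name in ["ernie-3.0-medium-zh", "ernie-m-base"]:
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name, num_classes=2)
    encoded = tokenizer(text="这是一条测试文本", max_seq_len=128)
    # ernie-3.0 produces input_ids and token_type_ids; ernie-m produces input_ids only.
    batch = {k: paddle.to_tensor([v]) for k, v in encoded.items()}
    logits = model(**batch)  # no hard-coded token_type_ids argument
    print(name, sorted(encoded.keys()), logits.shape)
```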

labels.extend(label.numpy())
probs.extend(F.sigmoid(logits).numpy())
probs = np.array(probs)
@@ -158,9 +157,8 @@ def evaluate():
probs = []
labels = []
for batch in dev_data_loader:
input_ids, token_type_ids, label = batch['input_ids'], batch[
'token_type_ids'], batch['labels']
logits = model(input_ids, token_type_ids)
label = batch.pop("labels")
logits = model(**batch)
Collaborator

Same as above.

Contributor Author

Same as above.

labels.extend(label.numpy())
probs.extend(F.sigmoid(logits).numpy())
probs = np.array(probs)
87 changes: 39 additions & 48 deletions applications/text_classification/hierarchical/predict.py
@@ -14,15 +14,19 @@

import os
import argparse

import functools
import numpy as np

import paddle
import paddle.nn.functional as F
from paddlenlp.utils.log import logger
from paddlenlp.data import Tuple, Pad
from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

from utils import preprocess_function, read_local_dataset

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--device', default="gpu", help="Select which device to train model, defaults to gpu.")
@@ -37,42 +41,47 @@


@paddle.no_grad()
def predict(data, label_list):
def predict():
"""
Predicts the data labels.
Args:

data (obj:`List`): The processed data whose each element is one sequence.
label_map(obj:`List`): The label id (key) to label str (value) map.

Predicts the data labels.
"""
paddle.set_device(args.device)
model = AutoModelForSequenceClassification.from_pretrained(args.params_path)
tokenizer = AutoTokenizer.from_pretrained(args.params_path)

examples = []
for text in data:
result = tokenizer(text=text, max_seq_len=args.max_seq_length)
examples.append((result['input_ids'], result['token_type_ids']))
label_list = []
label_path = os.path.join(args.dataset_dir, args.label_file)
with open(label_path, 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
label_list.append(line.strip())

data_ds = load_dataset(read_local_dataset,
path=os.path.join(args.dataset_dir, args.data_file),
is_test=True,
lazy=False)

trans_func = functools.partial(preprocess_function,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
label_nums=len(label_list),
is_test=True)

# Seperates data into some batches.
batches = [
examples[i:i + args.batch_size]
for i in range(0, len(examples), args.batch_size)
]
data_ds = data_ds.map(trans_func)

batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
): fn(samples)
# batchify dataset
collate_fn = DataCollatorWithPadding(tokenizer)
data_batch_sampler = BatchSampler(data_ds,
batch_size=args.batch_size,
shuffle=False)

data_data_loader = DataLoader(dataset=data_ds,
batch_sampler=data_batch_sampler,
collate_fn=collate_fn)

results = []
model.eval()
for batch in batches:
input_ids, token_type_ids = batchify_fn(batch)
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
logits = model(input_ids, token_type_ids)
for batch in data_data_loader:
logits = model(**batch)
probs = F.sigmoid(logits).numpy()
for prob in probs:
labels = []
@@ -81,9 +90,9 @@ def predict(data, label_list):
labels.append(label_list[i])
results.append(labels)

for text, labels in zip(data, results):
for t, labels in zip(data_ds.data, results):
hierarchical_labels = {}
logger.info("text: {}".format(text))
logger.info("text: {}".format(t["sentence"]))
logger.info("prediction result: {}".format(",".join(labels)))
for label in labels:
for i, l in enumerate(label.split('##')):
@@ -100,22 +109,4 @@ def predict(data, label_list):

if __name__ == "__main__":

data_dir = os.path.join(args.dataset_dir, args.data_file)
label_dir = os.path.join(args.dataset_dir, args.label_file)

data = []
label_list = []

with open(data_dir, 'r', encoding='utf-8') as f:
lines = f.readlines()
for i, line in enumerate(lines):
data.append(line.strip())
f.close()

with open(label_dir, 'r', encoding='utf-8') as f:
lines = f.readlines()
for i, line in enumerate(lines):
label_list.append(line.strip())
f.close()

predict(data, label_list)
predict()
8 changes: 3 additions & 5 deletions applications/text_classification/hierarchical/train.py
@@ -40,7 +40,7 @@
parser.add_argument("--save_dir", default="./checkpoint", type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument('--model_name', default="ernie-3.0-medium-zh", help="Select model to train, defaults to ernie-3.0-medium-zh.",
choices=["ernie-3.0-xbase-zh", "ernie-3.0-base-zh", "ernie-3.0-medium-zh", "ernie-3.0-micro-zh", "ernie-3.0-mini-zh", "ernie-3.0-nano-zh", "ernie-2.0-base-en", "ernie-2.0-large-en"])
choices=["ernie-3.0-xbase-zh", "ernie-3.0-base-zh", "ernie-3.0-medium-zh", "ernie-3.0-micro-zh", "ernie-3.0-mini-zh", "ernie-3.0-nano-zh", "ernie-2.0-base-en", "ernie-2.0-large-en","ernie-m-base","ernie-m-large"])
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--epochs", default=100, type=int, help="Total number of training epochs to perform.")
@@ -178,10 +178,8 @@ def train():

for step, batch in enumerate(train_data_loader, start=1):

input_ids, token_type_ids, labels = batch['input_ids'], batch[
'token_type_ids'], batch['labels']

logits = model(input_ids, token_type_ids)
labels = batch.pop("labels")
logits = model(**batch)
loss = criterion(logits, labels)

probs = F.sigmoid(logits)
37 changes: 23 additions & 14 deletions applications/text_classification/hierarchical/utils.py
@@ -34,9 +34,8 @@ def evaluate(model, criterion, metric, data_loader):
metric.reset()
losses = []
for batch in data_loader:
input_ids, token_type_ids, labels = batch['input_ids'], batch[
'token_type_ids'], batch['labels']
logits = model(input_ids, token_type_ids)
labels = batch.pop("labels")
logits = model(**batch)
loss = criterion(logits, labels)
probs = F.sigmoid(logits)
losses.append(loss.numpy())
@@ -51,7 +50,11 @@
return micro_f1_score, macro_f1_score


def preprocess_function(examples, tokenizer, max_seq_length, label_nums):
def preprocess_function(examples,
tokenizer,
max_seq_length,
label_nums,
is_test=False):
"""
Builds model inputs from a sequence for sequence classification tasks
by concatenating and adding special tokens.
@@ -68,21 +71,27 @@ def preprocess_function(examples, tokenizer, max_seq_length, label_nums):
"""
result = tokenizer(text=examples["sentence"], max_seq_len=max_seq_length)
# One-Hot label
result["labels"] = [
float(1) if i in examples["label"] else float(0)
for i in range(label_nums)
]
if not is_test:
result["labels"] = [
float(1) if i in examples["label"] else float(0)
for i in range(label_nums)
]
return result


def read_local_dataset(path, label_list):
def read_local_dataset(path, label_list=None, is_test=False):
"""
Read dataset
"""
with open(path, 'r', encoding='utf-8') as f:
for line in f:
items = line.strip().split('\t')
sentence = ''.join(items[:-1])
label = items[-1]
labels = [label_list[l] for l in label.split(',')]
yield {'sentence': sentence, 'label': labels}
if is_test:
items = line.strip().split('\t')
sentence = ''.join(items)
yield {'sentence': sentence}
else:
items = line.strip().split('\t')
sentence = ''.join(items[:-1])
label = items[-1]
labels = [label_list[l] for l in label.split(',')]
yield {'sentence': sentence, 'label': labels}
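
To tie the utils.py changes back to their callers, here is a small usage sketch. The file names under `data/` follow the README layout and are assumptions, as is `label_list` being the label-name-to-id dict that train.py builds from label.txt.

```python
# Sketch: the two branches of the reworked read_local_dataset.
from paddlenlp.datasets import load_dataset

from utils import read_local_dataset

# Inference path (predict.py): is_test=True, so each record has only "sentence".
data_ds = load_dataset(read_local_dataset,
                       path="data/data.txt",
                       is_test=True,
                       lazy=False)
print(data_ds[0])  # {'sentence': '...'}

# Training path (train.py): a label-name -> id mapping turns the last column into ids.
with open("data/label.txt", "r", encoding="utf-8") as f:
    label_list = {line.strip(): idx for idx, line in enumerate(f)}
train_ds = load_dataset(read_local_dataset,
                        path="data/train.txt",
                        label_list=label_list,
                        lazy=False)
print(train_ds[0])  # {'sentence': '...', 'label': [id, ...]}
```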