From 9a51345a907165510ede734d06d00bd270b5c5d2 Mon Sep 17 00:00:00 2001 From: w5688414 Date: Thu, 1 Sep 2022 03:02:57 +0000 Subject: [PATCH 1/3] Integrate Neural Search models into Pipelines --- .../recall/in_batch_negative/README.md | 69 +++++--- .../deploy/{C++ => cpp}/http_client.py | 2 +- .../deploy/{C++ => cpp}/rpc_client.py | 2 +- .../deploy/{C++ => cpp}/start_server.sh | 0 .../deploy/python/predict.py | 35 ++-- .../deploy/python/web_service.py | 5 +- .../recall/in_batch_negative/evaluate.py | 3 - .../recall/in_batch_negative/export_model.py | 5 +- .../recall/in_batch_negative/inference.py | 5 +- .../recall/in_batch_negative/predict.py | 5 +- .../recall/in_batch_negative/recall.py | 6 +- .../in_batch_negative/scripts/export_model.sh | 4 +- .../in_batch_negative/scripts/predict.sh | 4 +- .../scripts/run_build_index.sh | 1 + .../scripts/train_batch_neg.sh | 62 ------- .../in_batch_negative/train_batch_neg.py | 5 +- .../neural_search/recall/simcse/README.md | 31 ++-- .../recall/simcse/deploy/python/predict.py | 35 ++-- .../neural_search/recall/simcse/evaluate.py | 1 - .../recall/simcse/export_model.py | 6 +- .../neural_search/recall/simcse/inference.py | 5 +- .../neural_search/recall/simcse/predict.py | 6 +- .../neural_search/recall/simcse/recall.py | 8 +- .../recall/simcse/scripts/export_model.sh | 4 +- .../recall/simcse/scripts/predict.sh | 3 +- .../recall/simcse/scripts/run_build_index.sh | 5 +- .../recall/simcse/scripts/train.sh | 4 +- .../neural_search/recall/simcse/train.py | 4 +- pipelines/examples/semantic-search/README.md | 2 +- .../run_neural_search_server.sh | 5 + .../semantic-search/run_search_web.sh | 1 + .../semantic_search_example.py | 85 +++++++--- pipelines/pipelines/nodes/models/__init__.py | 15 ++ .../nodes/models/neural_search_model.py | 151 ++++++++++++++++++ pipelines/pipelines/nodes/retriever/dense.py | 31 +++- .../rest_api/pipeline/semantic_search.yaml | 2 +- .../pipeline/semantic_search_custom.yaml | 67 ++++++++ pipelines/utils/offline_ann.py | 40 +++-- 38 files changed, 502 insertions(+), 222 deletions(-) rename applications/neural_search/recall/in_batch_negative/deploy/{C++ => cpp}/http_client.py (97%) rename applications/neural_search/recall/in_batch_negative/deploy/{C++ => cpp}/rpc_client.py (97%) rename applications/neural_search/recall/in_batch_negative/deploy/{C++ => cpp}/start_server.sh (100%) delete mode 100644 applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh create mode 100644 pipelines/examples/semantic-search/run_neural_search_server.sh create mode 100644 pipelines/pipelines/nodes/models/__init__.py create mode 100644 pipelines/pipelines/nodes/models/neural_search_model.py create mode 100644 pipelines/rest_api/pipeline/semantic_search_custom.yaml diff --git a/applications/neural_search/recall/in_batch_negative/README.md b/applications/neural_search/recall/in_batch_negative/README.md index 326b34bf89c1..151a26f10b4a 100644 --- a/applications/neural_search/recall/in_batch_negative/README.md +++ b/applications/neural_search/recall/in_batch_negative/README.md @@ -42,7 +42,7 @@ In-batch Negatives 策略的训练数据为语义相似的 Pair 对,策略核 ### 技术方案 -双塔模型,采用ERNIE1.0热启,在召回训练阶段引入In-batch Negatives 策略,使用hnswlib建立索引库,进行召回测试。 +双塔模型,在召回训练阶段引入In-batch Negatives 策略,使用hnswlib建立索引库,进行召回测试。 ### 评估指标 @@ -53,10 +53,10 @@ Recall@K召回率是指预测的前topK(top-k是指从最后的按得分排序 **效果评估** -| 模型 | Recall@1 | Recall@5 |Recall@10 |Recall@20 |Recall@50 |策略简要说明| +| 策略 | 模型 | Recall@1 | Recall@5 |Recall@10 |Recall@20 |Recall@50 | | ------------ | ------------ | ------------ |--------- 
|--------- |--------- |--------- | -| In-batch Negatives | 51.301 | 65.309| 69.878| 73.996|78.881| Inbatch-negative有监督训练| - +| In-batch Negatives | ernie 1.0 | 51.301 | 65.309| 69.878| 73.996|78.881| +| In-batch Negatives | rocketqa-zh-base-query-encoder | **59.622** | **75.089**| **79.668**| **83.404**|**87.773**| @@ -166,10 +166,10 @@ Recall@K召回率是指预测的前topK(top-k是指从最后的按得分排序 |Model|训练参数配置|硬件|MD5| | ------------ | ------------ | ------------ |-----------| -|[batch_neg](https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip)|
margin:0.2 scale:30 epoch:3 lr:5E-5 bs:64 max_len:64|4卡 v100-16g|f3e5c7d7b0b718c2530c5e1b136b2d74|
+|[batch_neg](https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip)|ernie 1.0 margin:0.2 scale:30 epoch:3 lr:5E-5 bs:64 max_len:64|4卡 v100-16g
|f3e5c7d7b0b718c2530c5e1b136b2d74| -### 训练环境说明 +### 训练环境说明 - NVIDIA Driver Version: 440.64.00 - Ubuntu 16.04.6 LTS (Docker) @@ -185,7 +185,7 @@ Recall@K召回率是指预测的前topK(top-k是指从最后的按得分排序 然后运行下面的命令使用GPU训练,得到语义索引模型: ``` -root_path=recall +root_path=inbatch python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ train_batch_neg.py \ --device gpu \ @@ -194,11 +194,11 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ --learning_rate 5E-5 \ --epochs 3 \ --output_emb_size 256 \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --save_steps 10 \ --max_seq_length 64 \ --margin 0.2 \ --train_set_file recall/train.csv \ - --evaluate \ --recall_result_dir "recall_result_dir" \ --recall_result_file "recall_result.txt" \ --hnsw_m 100 \ @@ -217,6 +217,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ * `learning_rate`: 训练的学习率的大小 * `epochs`: 训练的epoch数 * `output_emb_size`: Transformer 顶层输出的文本向量维度 +* `model_name_or_path`: 预训练模型,用于模型和`Tokenizer`的参数初始化 * `save_steps`: 模型存储 checkpoint 的间隔 steps 个数 * `max_seq_length`: 输入序列的最大长度 * `margin`: 正样本相似度与负样本之间的目标 Gap @@ -234,7 +235,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ 也可以使用bash脚本: ``` -sh scripts/train_batch_neg.sh +sh scripts/train.sh ``` @@ -270,6 +271,7 @@ python -u -m paddle.distributed.launch --gpus "3" --log_dir "recall_log/" \ --recall_result_dir "recall_result_dir" \ --recall_result_file "recall_result.txt" \ --params_path "${root_dir}/model_40/model_state.pdparams" \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --hnsw_m 100 \ --hnsw_ef 100 \ --batch_size 64 \ @@ -280,16 +282,17 @@ python -u -m paddle.distributed.launch --gpus "3" --log_dir "recall_log/" \ --corpus_file "recall/corpus.csv" ``` 参数含义说明 -* `device`: 使用 cpu/gpu 进行训练 -* `recall_result_dir`: 召回结果存储目录 -* `recall_result_file`: 召回结果的文件名 +* `device`: 使用 cpu/gpu 进行训练 +* `recall_result_dir`: 召回结果存储目录 +* `recall_result_file`: 召回结果的文件名 * `params_path`: 待评估模型的参数文件名 -* `hnsw_m`: hnsw 算法相关参数,保持默认即可 -* `hnsw_ef`: hnsw 算法相关参数,保持默认即可 -* `output_emb_size`: Transformer 顶层输出的文本向量维度 -* `recall_num`: 对 1 个文本召回的相似文本数量 -* `similar_text_pair`: 由相似文本对构成的评估集 -* `corpus_file`: 召回库数据 corpus_file +* `model_name_or_path`: 预训练模型,用于模型和`Tokenizer`的参数初始化 +* `hnsw_m`: hnsw 算法相关参数,保持默认即可 +* `hnsw_ef`: hnsw 算法相关参数,保持默认即可 +* `output_emb_size`: Transformer 顶层输出的文本向量维度 +* `recall_num`: 对 1 个文本召回的相似文本数量 +* `similar_text_pair`: 由相似文本对构成的评估集 +* `corpus_file`: 召回库数据 corpus_file 也可以使用下面的bash脚本: @@ -383,10 +386,11 @@ python inference.py ``` root_dir="checkpoints/inbatch" -python -u -m paddle.distributed.launch --gpus "3" \ +python -u -m paddle.distributed.launch --gpus "0" \ predict.py \ --device gpu \ --params_path "${root_dir}/model_40/model_state.pdparams" \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --output_emb_size 256 \ --batch_size 128 \ --max_seq_length 64 \ @@ -396,6 +400,7 @@ python -u -m paddle.distributed.launch --gpus "3" \ 参数含义说明 * `device`: 使用 cpu/gpu 进行训练 * `params_path`: 预训练模型的参数文件名 +* `model_name_or_path`: 预训练模型,用于模型和`Tokenizer`的参数初始化 * `output_emb_size`: Transformer 顶层输出的文本向量维度 * `text_pair_file`: 由文本 Pair 构成的待预测数据集 @@ -423,7 +428,9 @@ predict.sh文件包含了cpu和gpu运行的脚本,默认是gpu运行的脚本 首先把动态图模型转换为静态图: ``` -python export_model.py --params_path checkpoints/inbatch/model_40/model_state.pdparams --output_path=./output +python export_model.py --params_path checkpoints/inbatch/model_40/model_state.pdparams \ + --model_name_or_path rocketqa-zh-base-query-encoder \ + --output_path=./output ``` 也可以运行下面的bash脚本: @@ -449,7 +456,9 @@ 
corpus_list=[['中西方语言与文化的差异','中西方文化差异以及 然后使用PaddleInference ``` -python deploy/python/predict.py --model_dir=./output +python deploy/python/predict.py \ + --model_dir=./output \ + --model_name_or_path rocketqa-zh-base-query-encoder ``` 也可以运行下面的bash脚本: @@ -501,9 +510,16 @@ Paddle Serving的部署有两种方式,第一种方式是Pipeline的方式, #### Pipeline方式 -启动 Pipeline Server: +修改模型需要用到的`Tokenizer` + +``` +self.tokenizer = AutoTokenizer.from_pretrained("rocketqa-zh-base-query-encoder") +``` + +然后启动 Pipeline Server: ``` +cd deploy/python python web_service.py ``` @@ -520,7 +536,7 @@ list_data = [ 然后运行: ``` -python rpc_client.py +python deploy/python/rpc_client.py ``` 模型的输出为: @@ -547,12 +563,12 @@ python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_i 也可以使用脚本: ``` -sh deploy/C++/start_server.sh +sh deploy/cpp/start_server.sh ``` Client 可以使用 http 或者 rpc 两种方式,rpc 的方式为: ``` -python deploy/C++/rpc_client.py +python deploy/cpp/rpc_client.py ``` 运行的输出为: ``` @@ -571,7 +587,7 @@ time to cost :0.3960278034210205 seconds 或者使用 http 的客户端访问模式: ``` -python deploy/C++/http_client.py +python deploy/cpp/http_client.py ``` 运行的输出为: @@ -599,6 +615,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ train_batch_neg.py \ --device gpu \ --save_dir ./checkpoints/simcse_inbatch_negative \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --batch_size 64 \ --learning_rate 5E-5 \ --epochs 3 \ diff --git a/applications/neural_search/recall/in_batch_negative/deploy/C++/http_client.py b/applications/neural_search/recall/in_batch_negative/deploy/cpp/http_client.py similarity index 97% rename from applications/neural_search/recall/in_batch_negative/deploy/C++/http_client.py rename to applications/neural_search/recall/in_batch_negative/deploy/cpp/http_client.py index 320c97166936..164038f46dab 100644 --- a/applications/neural_search/recall/in_batch_negative/deploy/C++/http_client.py +++ b/applications/neural_search/recall/in_batch_negative/deploy/cpp/http_client.py @@ -54,7 +54,7 @@ def convert_example(example, print(fetch_names) # 创建tokenizer -tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') +tokenizer = AutoTokenizer.from_pretrained('rocketqa-zh-base-query-encoder') max_seq_len = 64 # 数据预处理 diff --git a/applications/neural_search/recall/in_batch_negative/deploy/C++/rpc_client.py b/applications/neural_search/recall/in_batch_negative/deploy/cpp/rpc_client.py similarity index 97% rename from applications/neural_search/recall/in_batch_negative/deploy/C++/rpc_client.py rename to applications/neural_search/recall/in_batch_negative/deploy/cpp/rpc_client.py index 43275e8d7117..1b257b8d2fb7 100644 --- a/applications/neural_search/recall/in_batch_negative/deploy/C++/rpc_client.py +++ b/applications/neural_search/recall/in_batch_negative/deploy/cpp/rpc_client.py @@ -50,7 +50,7 @@ def convert_example(example, print(fetch_names) # 创建tokenizer -tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') +tokenizer = AutoTokenizer.from_pretrained('rocketqa-zh-base-query-encoder') max_seq_len = 64 # 数据预处理 diff --git a/applications/neural_search/recall/in_batch_negative/deploy/C++/start_server.sh b/applications/neural_search/recall/in_batch_negative/deploy/cpp/start_server.sh similarity index 100% rename from applications/neural_search/recall/in_batch_negative/deploy/C++/start_server.sh rename to applications/neural_search/recall/in_batch_negative/deploy/cpp/start_server.sh diff --git a/applications/neural_search/recall/in_batch_negative/deploy/python/predict.py 
b/applications/neural_search/recall/in_batch_negative/deploy/python/predict.py index 5e592b5c502b..0c2f2209051d 100644 --- a/applications/neural_search/recall/in_batch_negative/deploy/python/predict.py +++ b/applications/neural_search/recall/in_batch_negative/deploy/python/predict.py @@ -40,7 +40,7 @@ help="Batch size per GPU/CPU for training.") parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") - +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="model name.") parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.') parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], @@ -156,22 +156,21 @@ def __init__(self, if args.benchmark: import auto_log pid = os.getpid() - self.autolog = auto_log.AutoLogger(model_name="ernie-3.0-medium-zh", - model_precision=precision, - batch_size=self.batch_size, - data_shape="dynamic", - save_path=args.save_log_path, - inference_config=config, - pids=pid, - process_name=None, - gpu_ids=0, - time_keys=[ - 'preprocess_time', - 'inference_time', - 'postprocess_time' - ], - warmup=0, - logger=logger) + self.autolog = auto_log.AutoLogger( + model_name=args.model_name_or_path, + model_precision=precision, + batch_size=self.batch_size, + data_shape="dynamic", + save_path=args.save_log_path, + inference_config=config, + pids=pid, + process_name=None, + gpu_ids=0, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) def extract_embedding(self, data, tokenizer): """ @@ -279,7 +278,7 @@ def predict(self, data, tokenizer): # ErnieTinyTokenizer is special for ernie-tiny pretained model. 
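    # The tokenizer must match the backbone the static graph was exported
    # from (rocketqa-zh-base-query-encoder by default); otherwise the token
    # ids fed to the exported model will not line up with training.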
output_emb_size = 256 - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'} corpus_list = [{idx: text} for idx, text in id2corpus.items()] res = predictor.extract_embedding(corpus_list, tokenizer) diff --git a/applications/neural_search/recall/in_batch_negative/deploy/python/web_service.py b/applications/neural_search/recall/in_batch_negative/deploy/python/web_service.py index 42be3d9c2029..af239075b0bc 100644 --- a/applications/neural_search/recall/in_batch_negative/deploy/python/web_service.py +++ b/applications/neural_search/recall/in_batch_negative/deploy/python/web_service.py @@ -40,7 +40,8 @@ class ErnieOp(Op): def init_op(self): from paddlenlp.transformers import AutoTokenizer - self.tokenizer = AutoTokenizer.from_pretrained('ernie-1.0') + self.tokenizer = AutoTokenizer.from_pretrained( + "rocketqa-zh-base-query-encoder") def preprocess(self, input_dicts, data_id, log_id): from paddlenlp.data import Stack, Tuple, Pad @@ -56,7 +57,7 @@ def preprocess(self, input_dicts, data_id, log_id): batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64" ), # input - Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64" + Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64" ), # segment ): fn(samples) input_ids, segment_ids = batchify_fn(examples) diff --git a/applications/neural_search/recall/in_batch_negative/evaluate.py b/applications/neural_search/recall/in_batch_negative/evaluate.py index 262dca418bb6..449887306067 100644 --- a/applications/neural_search/recall/in_batch_negative/evaluate.py +++ b/applications/neural_search/recall/in_batch_negative/evaluate.py @@ -76,8 +76,6 @@ def recall(rs, N=10): relevance_labels.append(1) else: relevance_labels.append(0) - # print(len(rs)) - # print(rs[:50]) recall_N = [] recall_num = [1, 5, 10, 20, 50] @@ -92,4 +90,3 @@ def recall(rs, N=10): print('recall@{}={}'.format(key, val)) res.append(str(val)) result.write('\t'.join(res) + '\n') - # print("\t".join(recall_N)) diff --git a/applications/neural_search/recall/in_batch_negative/export_model.py b/applications/neural_search/recall/in_batch_negative/export_model.py index 3b98c4fb9134..b2bed9a0bbcf 100644 --- a/applications/neural_search/recall/in_batch_negative/export_model.py +++ b/applications/neural_search/recall/in_batch_negative/export_model.py @@ -28,6 +28,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.") +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="Select model to train, defaults to rocketqa-zh-base-query-encoder.") parser.add_argument("--output_path", type=str, default='./output', help="The path of model parameter in static graph to be saved.") args = parser.parse_args() @@ -35,8 +36,8 @@ if __name__ == "__main__": output_emb_size = 256 - pretrained_model = AutoModel.from_pretrained("ernie-1.0") - tokenizer = AutoTokenizer.from_pretrained('ernie-1.0') + pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) model = SemanticIndexBaseStatic(pretrained_model, output_emb_size=output_emb_size) if args.params_path and os.path.isfile(args.params_path): diff --git 
a/applications/neural_search/recall/in_batch_negative/inference.py b/applications/neural_search/recall/in_batch_negative/inference.py index 5e8b5fc914b3..21bc39b3affa 100644 --- a/applications/neural_search/recall/in_batch_negative/inference.py +++ b/applications/neural_search/recall/in_batch_negative/inference.py @@ -26,9 +26,10 @@ batch_size = 1 params_path = 'checkpoints/inbatch/model_40/model_state.pdparams' id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'} + model_name_or_path = "rocketqa-zh-base-query-encoder" paddle.set_device(device) - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length) @@ -38,7 +39,7 @@ Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # text_segment ): [data for data in fn(samples)] - pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh") + pretrained_model = AutoModel.from_pretrained(model_name_or_path) model = SemanticIndexBaseStatic(pretrained_model, output_emb_size=output_emb_size) diff --git a/applications/neural_search/recall/in_batch_negative/predict.py b/applications/neural_search/recall/in_batch_negative/predict.py index 14b73443c1e4..9e30ed94e71b 100644 --- a/applications/neural_search/recall/in_batch_negative/predict.py +++ b/applications/neural_search/recall/in_batch_negative/predict.py @@ -37,6 +37,7 @@ help="The path to model parameters to be loaded.") parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. " "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="Select model to train, defaults to rocketqa-zh-base-query-encoder.") parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--output_emb_size", default=None, @@ -83,7 +84,7 @@ def predict(model, data_loader): if __name__ == "__main__": paddle.set_device(args.device) - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, @@ -107,7 +108,7 @@ def predict(model, data_loader): batchify_fn=batchify_fn, trans_fn=trans_func) - pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh") + pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) model = SemanticIndexBase(pretrained_model, output_emb_size=args.output_emb_size) diff --git a/applications/neural_search/recall/in_batch_negative/recall.py b/applications/neural_search/recall/in_batch_negative/recall.py index 78874fc93eb5..f7c73aabec14 100644 --- a/applications/neural_search/recall/in_batch_negative/recall.py +++ b/applications/neural_search/recall/in_batch_negative/recall.py @@ -55,7 +55,7 @@ type=int, help="output_embedding_size") parser.add_argument("--recall_num", default=10, type=int, help="Recall number for each query from Ann index.") - +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="Select model to train, defaults to rocketqa-zh-base-query-encoder.") parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.") parser.add_argument("--hnsw_ef", default=100, type=int, @@ -74,7 +74,7 @@ if paddle.distributed.get_world_size() > 1: 
paddle.distributed.init_parallel_env() - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, @@ -87,7 +87,7 @@ ), # text_segment ): [data for data in fn(samples)] - pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh") + pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) model = SemanticIndexBase(pretrained_model, output_emb_size=args.output_emb_size) diff --git a/applications/neural_search/recall/in_batch_negative/scripts/export_model.sh b/applications/neural_search/recall/in_batch_negative/scripts/export_model.sh index f59ecefbfbab..99d01c7b5aae 100644 --- a/applications/neural_search/recall/in_batch_negative/scripts/export_model.sh +++ b/applications/neural_search/recall/in_batch_negative/scripts/export_model.sh @@ -1 +1,3 @@ -python export_model.py --params_path checkpoints/inbatch/model_40/model_state.pdparams --output_path=./output \ No newline at end of file +python export_model.py --params_path checkpoints/inbatch/model_40/model_state.pdparams \ + --model_name_or_path rocketqa-zh-base-query-encoder \ + --output_path=./output \ No newline at end of file diff --git a/applications/neural_search/recall/in_batch_negative/scripts/predict.sh b/applications/neural_search/recall/in_batch_negative/scripts/predict.sh index 5a253520ded0..3967bb2c9b5d 100644 --- a/applications/neural_search/recall/in_batch_negative/scripts/predict.sh +++ b/applications/neural_search/recall/in_batch_negative/scripts/predict.sh @@ -1,10 +1,10 @@ # gpu version - root_dir="checkpoints/inbatch" -python -u -m paddle.distributed.launch --gpus "3" \ +python -u -m paddle.distributed.launch --gpus "0" \ predict.py \ --device gpu \ --params_path "${root_dir}/model_40/model_state.pdparams" \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --output_emb_size 256 \ --batch_size 128 \ --max_seq_length 64 \ diff --git a/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh b/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh index a9f400dfb401..857302c334a1 100755 --- a/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh +++ b/applications/neural_search/recall/in_batch_negative/scripts/run_build_index.sh @@ -6,6 +6,7 @@ python -u -m paddle.distributed.launch --gpus "3" --log_dir "recall_log/" \ --recall_result_dir "recall_result_dir" \ --recall_result_file "recall_result.txt" \ --params_path "${root_dir}/model_40/model_state.pdparams" \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --hnsw_m 100 \ --hnsw_ef 100 \ --batch_size 64 \ diff --git a/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh b/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh deleted file mode 100644 index f493b89b5fc3..000000000000 --- a/applications/neural_search/recall/in_batch_negative/scripts/train_batch_neg.sh +++ /dev/null @@ -1,62 +0,0 @@ -# GPU training -root_path=inbatch -python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ - train_batch_neg.py \ - --device gpu \ - --save_dir ./checkpoints/${root_path} \ - --batch_size 64 \ - --learning_rate 5E-5 \ - --epochs 3 \ - --output_emb_size 256 \ - --save_steps 10 \ - --max_seq_length 64 \ - --margin 0.2 \ - --train_set_file recall/train.csv - - -# cpu training -# root_path=inbatch -# python train_batch_neg.py \ -# --device cpu \ -# --save_dir ./checkpoints/${root_path} \ -# 
--batch_size 64 \ -# --learning_rate 5E-5 \ -# --epochs 3 \ -# --output_emb_size 256 \ -# --save_steps 10 \ -# --max_seq_length 64 \ -# --margin 0.2 \ -# --train_set_file recall/train.csv - - - -# 加载simcse训练的模型,模型放在simcse/model_20000 -# python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ -# train_batch_neg.py \ -# --device gpu \ -# --save_dir ./checkpoints/simcse_inbatch_negative \ -# --batch_size 64 \ -# --learning_rate 5E-5 \ -# --epochs 3 \ -# --output_emb_size 256 \ -# --save_steps 10 \ -# --max_seq_length 64 \ -# --margin 0.2 \ -# --evaluate \ -# --train_set_file recall/train.csv \ -# --init_from_ckpt simcse/model_20000/model_state.pdparams - -# 加载post training的模型,模型放在simcse/post_model_10000 -# python -u -m paddle.distributed.launch --gpus "0,1,2,3" \ -# train_batch_neg.py \ -# --device gpu \ -# --save_dir ./checkpoints/post_simcse_inbatch_negative \ -# --batch_size 64 \ -# --learning_rate 5E-5 \ -# --epochs 3 \ -# --output_emb_size 256 \ -# --save_steps 10 \ -# --max_seq_length 64 \ -# --margin 0.2 \ -# --train_set_file recall/train.csv \ -# --init_from_ckpt simcse/post_model_10000/model_state.pdparams diff --git a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py index 222b02d16423..10bead311455 100644 --- a/applications/neural_search/recall/in_batch_negative/train_batch_neg.py +++ b/applications/neural_search/recall/in_batch_negative/train_batch_neg.py @@ -37,6 +37,7 @@ "Sequences longer than this will be truncated, sequences shorter will be padded.") parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training") parser.add_argument("--output_emb_size", default=256, type=int, help="output_embedding_size") parser.add_argument("--learning_rate", default=5E-5, type=float, @@ -172,9 +173,9 @@ def do_train(): data_path=args.train_set_file, lazy=False) - pretrained_model = AutoModel.from_pretrained('ernie-3.0-medium-zh') + pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, diff --git a/applications/neural_search/recall/simcse/README.md b/applications/neural_search/recall/simcse/README.md index 9090dd6fcc1e..e5be579103fa 100644 --- a/applications/neural_search/recall/simcse/README.md +++ b/applications/neural_search/recall/simcse/README.md @@ -50,10 +50,10 @@ SimCSE 模型适合缺乏监督数据,但是又有大量无监督数据的匹 **效果评估** -| 模型 | Recall@1 | Recall@5 |Recall@10 |Recall@20 |Recall@50 |策略简要说明| +| 策略 | 模型| Recall@1 | Recall@5 |Recall@10 |Recall@20 |Recall@50 | | ------------ | ------------ | ------------ |--------- |--------- |--------- |--------- | -| SimCSE | 42.374 | 57.505| 62.641| 67.09|72.331| SimCSE无监督训练| - +| SimCSE | ernie 1.0 |42.374 | 57.505| 62.641| 67.09|72.331| +| SimCSE | rocketqa-zh-base-query-encoder |**50.108** | **64.005**| **68.288**| **72.306**|**77.306**| @@ -151,14 +151,6 @@ simcse/ ## 5. 模型训练 -**语义索引预训练模型下载链接:** - -以下模型结构参数为: `TrasformerLayer:12, Hidden:768, Heads:12, OutputEmbSize: 256` - -|Model|训练参数配置|硬件|MD5| -| ------------ | ------------ | ------------ |-----------| -|[SimCSE](https://bj.bcebos.com/v1/paddlenlp/models/simcse_model.zip)|
epoch:3 lr:5E-5 bs:64 max_len:64|4卡 v100-16g
|7c46d9b15a214292e3897c0eb70d0c9f| - ### 训练环境说明 + NVIDIA Driver Version: 440.64.00 @@ -188,7 +180,8 @@ python -u -m paddle.distributed.launch --gpus '0,1,2,3' \ --dropout 0.2 \ --output_emb_size 256 \ --train_set_file "./recall/train_unsupervised.csv" \ - --test_set_file "./recall/dev.csv" + --test_set_file "./recall/dev.csv" \ + --model_name_or_path "rocketqa-zh-base-query-encoder" ``` 也可以使用bash脚本: @@ -213,6 +206,7 @@ sh scripts/train.sh * `init_from_ckpt`:可选,模型参数路径,热启动模型训练;默认为None。 * `seed`:可选,随机种子,默认为1000. * `device`: 选用什么设备进行训练,可选cpu或gpu。如使用gpu训练则参数gpus指定GPU卡号。 +* `model_name_or_path`: 预训练模型,用于模型和`Tokenizer`的参数初始化。 程序运行时将会自动进行训练,评估。同时训练过程中会自动保存模型在指定的`save_dir`中。 如: @@ -255,7 +249,8 @@ python -u -m paddle.distributed.launch --gpus "6" --log_dir "recall_log/" \ --device gpu \ --recall_result_dir "recall_result_dir" \ --recall_result_file "recall_result.txt" \ - --params_path "checkpoints/model_20000/model_state.pdparams" \ + --params_path "checkpoints/model_12000/model_state.pdparams" \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --hnsw_m 100 \ --hnsw_ef 100 \ --batch_size 64 \ @@ -314,7 +309,7 @@ recall@50=74.848 修改 inference.py 文件里面输入文本 id2corpus 和模型路径 params_path: ``` -params_path='checkpoints/model_20000/model_state.pdparams' +params_path='checkpoints/model_12000/model_state.pdparams' id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'} ``` 然后运行 @@ -352,7 +347,8 @@ root_dir="checkpoints" python -u -m paddle.distributed.launch --gpus "3" \ predict.py \ --device gpu \ - --params_path "${root_dir}/model_20000/model_state.pdparams" \ + --params_path "${root_dir}/model_12000/model_state.pdparams" \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --output_emb_size 256 \ --batch_size 128 \ --max_seq_length 64 \ @@ -362,6 +358,7 @@ python -u -m paddle.distributed.launch --gpus "3" \ 参数含义说明 * `device`: 使用 cpu/gpu 进行训练 * `params_path`: 预训练模型的参数文件名 +* `model_name_or_path`: 预训练模型,用于模型和`Tokenizer`的参数初始化。 * `output_emb_size`: Transformer 顶层输出的文本向量维度 * `text_pair_file`: 由文本 Pair 构成的待预测数据集 @@ -388,7 +385,9 @@ sh scripts/predict.sh 首先把动态图模型转换为静态图: ``` -python export_model.py --params_path checkpoints/model_20000/model_state.pdparams --output_path=./output +python export_model.py --params_path checkpoints/model_12000/model_state.pdparams \ + --model_name_or_path rocketqa-zh-base-query-encoder \ + --output_path=./output ``` 也可以运行下面的bash脚本: diff --git a/applications/neural_search/recall/simcse/deploy/python/predict.py b/applications/neural_search/recall/simcse/deploy/python/predict.py index af5b28f9aa1d..93e1e8106111 100644 --- a/applications/neural_search/recall/simcse/deploy/python/predict.py +++ b/applications/neural_search/recall/simcse/deploy/python/predict.py @@ -40,7 +40,7 @@ help="Batch size per GPU/CPU for training.") parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") - +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="model name.") parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.') parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], @@ -153,22 +153,21 @@ def __init__(self, if args.benchmark: import auto_log pid = os.getpid() - self.autolog = auto_log.AutoLogger(model_name="ernie-3.0-medium-zh", - model_precision=precision, - batch_size=self.batch_size, - data_shape="dynamic", - save_path=args.save_log_path, - 
inference_config=config, - pids=pid, - process_name=None, - gpu_ids=0, - time_keys=[ - 'preprocess_time', - 'inference_time', - 'postprocess_time' - ], - warmup=0, - logger=logger) + self.autolog = auto_log.AutoLogger( + model_name=args.model_name_or_path, + model_precision=precision, + batch_size=self.batch_size, + data_shape="dynamic", + save_path=args.save_log_path, + inference_config=config, + pids=pid, + process_name=None, + gpu_ids=0, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) def extract_embedding(self, data, tokenizer): """ @@ -276,7 +275,7 @@ def predict(self, data, tokenizer): # ErnieTinyTokenizer is special for ernie-tiny pretained model. output_emb_size = 256 - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'} corpus_list = [{idx: text} for idx, text in id2corpus.items()] res = predictor.extract_embedding(corpus_list, tokenizer) diff --git a/applications/neural_search/recall/simcse/evaluate.py b/applications/neural_search/recall/simcse/evaluate.py index d211cbd3cf6d..b0dc868ba876 100644 --- a/applications/neural_search/recall/simcse/evaluate.py +++ b/applications/neural_search/recall/simcse/evaluate.py @@ -84,4 +84,3 @@ def recall(rs, N=10): print('recall@{}={}'.format(key, val)) res.append(str(val)) result.write('\t'.join(res) + '\n') - # print("\t".join(recall_N)) diff --git a/applications/neural_search/recall/simcse/export_model.py b/applications/neural_search/recall/simcse/export_model.py index c4598fa9acfa..8781877c8bec 100644 --- a/applications/neural_search/recall/simcse/export_model.py +++ b/applications/neural_search/recall/simcse/export_model.py @@ -14,7 +14,6 @@ import argparse import os -from functools import partial import numpy as np import paddle @@ -28,6 +27,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.") parser.add_argument("--output_path", type=str, default='./output', help="The path of model parameter in static graph to be saved.") +parser.add_argument("--model_name_or_path",default='rocketqa-zh-base-query-encoder',type=str,help='The pretrained model used for training') args = parser.parse_args() # yapf: enable @@ -35,9 +35,9 @@ # If you want to use ernie1.0 model, plesace uncomment the following code output_emb_size = 256 - pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh") + pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) model = SimCSE(pretrained_model, output_emb_size=output_emb_size) if args.params_path and os.path.isfile(args.params_path): diff --git a/applications/neural_search/recall/simcse/inference.py b/applications/neural_search/recall/simcse/inference.py index 8788b35cf1cc..0e11c6ad65e4 100644 --- a/applications/neural_search/recall/simcse/inference.py +++ b/applications/neural_search/recall/simcse/inference.py @@ -57,9 +57,10 @@ def convert_example(example, tokenizer, max_seq_length=512, do_evalute=False): batch_size = 1 params_path = 'checkpoints/model_20000/model_state.pdparams' id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'} + model_name_or_path = "rocketqa-zh-base-query-encoder" paddle.set_device(device) 
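    # The tokenizer and backbone below are both built from model_name_or_path
    # so that tokenization stays consistent with the fine-tuned weights
    # loaded from params_path.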
- tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length) @@ -69,7 +70,7 @@ def convert_example(example, tokenizer, max_seq_length=512, do_evalute=False): Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # text_segment ): [data for data in fn(samples)] - pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh") + pretrained_model = AutoModel.from_pretrained(model_name_or_path) model = SimCSE(pretrained_model, output_emb_size=output_emb_size) diff --git a/applications/neural_search/recall/simcse/predict.py b/applications/neural_search/recall/simcse/predict.py index 02dc6147de70..9f96660a1cc9 100644 --- a/applications/neural_search/recall/simcse/predict.py +++ b/applications/neural_search/recall/simcse/predict.py @@ -40,6 +40,8 @@ parser.add_argument("--margin", default=0.0, type=float, help="Margin beteween pos_sample and neg_samples.") parser.add_argument("--scale", default=20, type=int, help="Scale for pair-wise margin_rank_loss.") parser.add_argument("--output_emb_size", default=0, type=int, help="Output_embedding_size, 0 means use hidden_size as output embedding size.") +parser.add_argument("--model_name_or_path",default='rocketqa-zh-base-query-encoder',type=str,help='The pretrained model used for training') + args = parser.parse_args() # yapf: enable @@ -80,7 +82,7 @@ def predict(model, data_loader): if __name__ == "__main__": paddle.set_device(args.device) - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, @@ -104,7 +106,7 @@ def predict(model, data_loader): batchify_fn=batchify_fn, trans_fn=trans_func) - pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh") + pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) model = SimCSE(pretrained_model, margin=args.margin, diff --git a/applications/neural_search/recall/simcse/recall.py b/applications/neural_search/recall/simcse/recall.py index 7d3d5e31b8f2..c784878b9e41 100644 --- a/applications/neural_search/recall/simcse/recall.py +++ b/applications/neural_search/recall/simcse/recall.py @@ -47,11 +47,10 @@ parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--output_emb_size", default=None, type=int, help="output_embedding_size") parser.add_argument("--recall_num", default=10, type=int, help="Recall number for each query from Ann index.") - parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.") parser.add_argument("--hnsw_ef", default=100, type=int, help="Recall number for each query from Ann index.") parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Recall number for each query from Ann index.") - +parser.add_argument("--model_name_or_path",default='rocketqa-zh-base-query-encoder',type=str,help='The pretrained model used for training') parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") args = parser.parse_args() # yapf: enable @@ -62,7 +61,7 @@ if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) trans_func 
= partial(convert_example_test, tokenizer=tokenizer, @@ -75,7 +74,7 @@ ), # text_segment ): [data for data in fn(samples)] - pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh") + pretrained_model = AutoModel.from_pretrained(args.model_name_or_path) model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size) model = paddle.DataParallel(model) @@ -107,7 +106,6 @@ final_index = build_index(args, corpus_data_loader, inner_model) text_list, text2similar_text = gen_text_file(args.similar_text_pair_file) - # print(text_list[:5]) query_ds = MapDataset(text_list) diff --git a/applications/neural_search/recall/simcse/scripts/export_model.sh b/applications/neural_search/recall/simcse/scripts/export_model.sh index f011b5fc900b..629440b9b079 100644 --- a/applications/neural_search/recall/simcse/scripts/export_model.sh +++ b/applications/neural_search/recall/simcse/scripts/export_model.sh @@ -1 +1,3 @@ -python export_model.py --params_path checkpoints/model_20000/model_state.pdparams --output_path=./output \ No newline at end of file +python export_model.py --params_path checkpoints/model_12000/model_state.pdparams \ + --model_name_or_path rocketqa-zh-base-query-encoder \ + --output_path=./output \ No newline at end of file diff --git a/applications/neural_search/recall/simcse/scripts/predict.sh b/applications/neural_search/recall/simcse/scripts/predict.sh index 141ea70d1b2d..758e3ecf1696 100644 --- a/applications/neural_search/recall/simcse/scripts/predict.sh +++ b/applications/neural_search/recall/simcse/scripts/predict.sh @@ -3,10 +3,11 @@ root_dir="checkpoints" python -u -m paddle.distributed.launch --gpus "3" \ predict.py \ --device gpu \ - --params_path "${root_dir}/model_20000/model_state.pdparams" \ + --params_path "${root_dir}/model_12000/model_state.pdparams" \ --output_emb_size 256 \ --batch_size 128 \ --max_seq_length 64 \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --text_pair_file "recall/test.csv" # cpu diff --git a/applications/neural_search/recall/simcse/scripts/run_build_index.sh b/applications/neural_search/recall/simcse/scripts/run_build_index.sh index b13fd69ed347..eee1ad359359 100755 --- a/applications/neural_search/recall/simcse/scripts/run_build_index.sh +++ b/applications/neural_search/recall/simcse/scripts/run_build_index.sh @@ -1,10 +1,11 @@ # gpu -python -u -m paddle.distributed.launch --gpus "6" --log_dir "recall_log/" \ +python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \ recall.py \ --device gpu \ --recall_result_dir "recall_result_dir" \ --recall_result_file "recall_result.txt" \ - --params_path "checkpoints/model_20000/model_state.pdparams" \ + --params_path "checkpoints/model_12000/model_state.pdparams" \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --hnsw_m 100 \ --hnsw_ef 100 \ --batch_size 64 \ diff --git a/applications/neural_search/recall/simcse/scripts/train.sh b/applications/neural_search/recall/simcse/scripts/train.sh index 79822037063e..60817e0ff7b5 100644 --- a/applications/neural_search/recall/simcse/scripts/train.sh +++ b/applications/neural_search/recall/simcse/scripts/train.sh @@ -1,5 +1,5 @@ # simcse gpu -python -u -m paddle.distributed.launch --gpus '0,1,2,3' \ +python -u -m paddle.distributed.launch --gpus '1,2,3,4' \ train.py \ --device gpu \ --save_dir ./checkpoints/ \ @@ -14,7 +14,7 @@ python -u -m paddle.distributed.launch --gpus '0,1,2,3' \ --output_emb_size 256 \ --train_set_file "./recall/train_unsupervised.csv" \ --test_set_file "./recall/dev.csv" \ - --model_name_or_path 
"ernie-3.0-medium-zh" + --model_name_or_path "rocketqa-zh-base-query-encoder" # simcse cpu # python train.py \ diff --git a/applications/neural_search/recall/simcse/train.py b/applications/neural_search/recall/simcse/train.py index d9d49e189529..c28672a79000 100644 --- a/applications/neural_search/recall/simcse/train.py +++ b/applications/neural_search/recall/simcse/train.py @@ -56,7 +56,7 @@ parser.add_argument("--scale", default=20, type=int, help="Scale for pair-wise margin_rank_loss.") parser.add_argument("--dropout", default=0.1, type=float, help="Dropout for pretrained model encoder.") parser.add_argument("--infer_with_fc_pooler", action='store_true', help="Whether use fc layer after cls embedding or not for when infer.") -parser.add_argument("--model_name_or_path",default='ernie-3.0-medium-zh',type=str,help='pretrained model') +parser.add_argument("--model_name_or_path",default='rocketqa-zh-base-query-encoder',type=str,help='The pretrained model used for training') args = parser.parse_args() @@ -84,7 +84,7 @@ def do_train(): hidden_dropout_prob=args.dropout, attention_probs_dropout_prob=args.dropout) print("loading model from {}".format(args.model_name_or_path)) - tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) trans_func = partial( convert_example, diff --git a/pipelines/examples/semantic-search/README.md b/pipelines/examples/semantic-search/README.md index 9ada8926840e..fc5d1076f3e1 100644 --- a/pipelines/examples/semantic-search/README.md +++ b/pipelines/examples/semantic-search/README.md @@ -17,7 +17,7 @@ ## 2. 产品功能介绍 -本项目提供了低成本搭建端到端语义检索系统的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的语义检索系统模型(召回模型、排序模型)快速搭建一个针对自己业务数据的问答系统,并可以提供 Web 化产品服务。 +本项目提供了低成本搭建端到端语义检索系统的能力。用户只需要处理好自己的业务数据,就可以使用本项目预置的语义检索系统模型(召回模型、排序模型)快速搭建一个针对自己业务数据的问答系统,并可以提供 Web 化产品服务。以下是使用预置模型的教程,如果用户想接入自己训练的模型,可以参考[Neural Search的流程](./Neural_Search.md)。 ### 2.1 系统特色 diff --git a/pipelines/examples/semantic-search/run_neural_search_server.sh b/pipelines/examples/semantic-search/run_neural_search_server.sh new file mode 100644 index 000000000000..3edad2c52818 --- /dev/null +++ b/pipelines/examples/semantic-search/run_neural_search_server.sh @@ -0,0 +1,5 @@ +# 指定语义检索系统的Yaml配置文件 +export CUDA_VISIBLE_DEVICES=0 +export PIPELINE_YAML_PATH=rest_api/pipeline/semantic_search_custom.yaml +# 使用端口号 8891 启动模型服务 +python rest_api/application.py 8891 \ No newline at end of file diff --git a/pipelines/examples/semantic-search/run_search_web.sh b/pipelines/examples/semantic-search/run_search_web.sh index 05530d8779eb..a1273daf018d 100644 --- a/pipelines/examples/semantic-search/run_search_web.sh +++ b/pipelines/examples/semantic-search/run_search_web.sh @@ -1,4 +1,5 @@ unset http_proxy && unset https_proxy +export PYTHONPATH=/wugaosheng/workplace/PaddleNLP/pipelines:$PYTHONPATH # 配置模型服务地址 export API_ENDPOINT=http://127.0.0.1:8891 # 在指定端口 8502 启动 WebUI diff --git a/pipelines/examples/semantic-search/semantic_search_example.py b/pipelines/examples/semantic-search/semantic_search_example.py index c24be521b393..b21b24b7631c 100644 --- a/pipelines/examples/semantic-search/semantic_search_example.py +++ b/pipelines/examples/semantic-search/semantic_search_example.py @@ -13,6 +13,23 @@ parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.") parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.") 
parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.") +parser.add_argument("--query_embedding_model", + default="rocketqa-zh-nano-query-encoder", + type=str, + help="The query_embedding_model path") + +parser.add_argument("--passage_embedding_model", + default="rocketqa-zh-nano-para-encoder", + type=str, + help="The passage_embedding_model path") +parser.add_argument("--params_path", + default="checkpoints/model_40/model_state.pdparams", + type=str, + help="The checkpoint path") +parser.add_argument("--embedding_dim", + default=312, + type=int, + help="The embedding_dim of index") args = parser.parse_args() # yapf: enable @@ -25,16 +42,29 @@ def semantic_search_tutorial(): if os.path.exists(args.index_name) and os.path.exists(faiss_document_store): # connect to existed FAISS Index document_store = FAISSDocumentStore.load(args.index_name) - retriever = DensePassageRetriever( - document_store=document_store, - query_embedding_model="rocketqa-zh-dureader-query-encoder", - passage_embedding_model="rocketqa-zh-dureader-query-encoder", - max_seq_len_query=args.max_seq_len_query, - max_seq_len_passage=args.max_seq_len_passage, - batch_size=args.retriever_batch_size, - use_gpu=use_gpu, - embed_title=False, - ) + if (os.path.exists(args.params_path)): + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model=args.query_embedding_model, + params_path=args.params_path, + output_emb_size=args.embedding_dim, + max_seq_len_query=args.max_seq_len_query, + max_seq_len_passage=args.max_seq_len_passage, + batch_size=args.retriever_batch_size, + use_gpu=use_gpu, + embed_title=False, + ) + else: + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model=args.query_embedding_model, + passage_embedding_model=args.passage_embedding_model, + max_seq_len_query=args.max_seq_len_query, + max_seq_len_passage=args.max_seq_len_passage, + batch_size=args.retriever_batch_size, + use_gpu=use_gpu, + embed_title=False, + ) else: doc_dir = "data/dureader_dev" dureader_data = "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip" @@ -49,20 +79,33 @@ def semantic_search_tutorial(): if os.path.exists(faiss_document_store): os.remove(faiss_document_store) - document_store = FAISSDocumentStore(embedding_dim=768, + document_store = FAISSDocumentStore(embedding_dim=args.embedding_dim, faiss_index_factory_str="Flat") document_store.write_documents(dicts) - retriever = DensePassageRetriever( - document_store=document_store, - query_embedding_model="rocketqa-zh-dureader-query-encoder", - passage_embedding_model="rocketqa-zh-dureader-query-encoder", - max_seq_len_query=args.max_seq_len_query, - max_seq_len_passage=args.max_seq_len_passage, - batch_size=args.retriever_batch_size, - use_gpu=use_gpu, - embed_title=False, - ) + if (os.path.exists(args.params_path)): + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model=args.query_embedding_model, + params_path=args.params_path, + output_emb_size=args.embedding_dim, + max_seq_len_query=args.max_seq_len_query, + max_seq_len_passage=args.max_seq_len_passage, + batch_size=args.retriever_batch_size, + use_gpu=use_gpu, + embed_title=False, + ) + else: + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model=args.query_embedding_model, + passage_embedding_model=args.passage_embedding_model, + max_seq_len_query=args.max_seq_len_query, + 
max_seq_len_passage=args.max_seq_len_passage, + batch_size=args.retriever_batch_size, + use_gpu=use_gpu, + embed_title=False, + ) # update Embedding document_store.update_embeddings(retriever) diff --git a/pipelines/pipelines/nodes/models/__init__.py b/pipelines/pipelines/nodes/models/__init__.py new file mode 100644 index 000000000000..231705d96e9d --- /dev/null +++ b/pipelines/pipelines/nodes/models/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pipelines.nodes.models.neural_search_model import SemanticIndexBatchNeg \ No newline at end of file diff --git a/pipelines/pipelines/nodes/models/neural_search_model.py b/pipelines/pipelines/nodes/models/neural_search_model.py new file mode 100644 index 000000000000..fd77f2a42ce0 --- /dev/null +++ b/pipelines/pipelines/nodes/models/neural_search_model.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
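+# Dual-encoder definitions ported from the Neural Search application:
+# SemanticIndexBase pools the [CLS] embedding and optionally projects it
+# down to output_emb_size; SemanticIndexBatchNeg adds the in-batch
+# negatives loss used at training time.
+#
+# Minimal loading sketch (mirrors how retriever/dense.py uses this class;
+# the checkpoint path is an example):
+#   pretrained = AutoModel.from_pretrained("rocketqa-zh-base-query-encoder")
+#   model = SemanticIndexBatchNeg(pretrained, output_emb_size=256)
+#   model.set_dict(paddle.load("model_state.pdparams"))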
+ +import sys +import abc + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SemanticIndexBase(nn.Layer): + + def __init__(self, pretrained_model, dropout=None, output_emb_size=None): + super().__init__() + self.ptm = pretrained_model + self.dropout = nn.Dropout(dropout if dropout is not None else 0.1) + + # if output_emb_size is not None, then add Linear layer to reduce embedding_size, + # we recommend set output_emb_size = 256 considering the trade-off beteween + # recall performance and efficiency + + self.output_emb_size = output_emb_size + if output_emb_size > 0: + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.TruncatedNormal(std=0.02)) + self.emb_reduce_linear = paddle.nn.Linear(768, + output_emb_size, + weight_attr=weight_attr) + + def get_pooled_embedding(self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None): + _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, + attention_mask) + + if self.output_emb_size > 0: + cls_embedding = self.emb_reduce_linear(cls_embedding) + cls_embedding = self.dropout(cls_embedding) + cls_embedding = F.normalize(cls_embedding, p=2, axis=-1) + + return cls_embedding + + def get_semantic_embedding(self, data_loader): + self.eval() + with paddle.no_grad(): + for batch_data in data_loader: + input_ids, token_type_ids = batch_data + + text_embeddings = self.get_pooled_embedding( + input_ids, token_type_ids=token_type_ids) + + yield text_embeddings + + def cosine_sim(self, + query_input_ids, + title_input_ids, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None): + + query_cls_embedding = self.get_pooled_embedding(query_input_ids, + query_token_type_ids, + query_position_ids, + query_attention_mask) + + title_cls_embedding = self.get_pooled_embedding(title_input_ids, + title_token_type_ids, + title_position_ids, + title_attention_mask) + + cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, + axis=-1) + return cosine_sim + + @abc.abstractmethod + def forward(self): + pass + + +class SemanticIndexBatchNeg(SemanticIndexBase): + + def __init__(self, + pretrained_model, + dropout=None, + margin=0.3, + scale=30, + output_emb_size=None): + super().__init__(pretrained_model, dropout, output_emb_size) + + self.margin = margin + # Used scaling cosine similarity to ease converge + self.sacle = scale + + def forward(self, + query_input_ids, + title_input_ids, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None): + + query_cls_embedding = self.get_pooled_embedding(query_input_ids, + query_token_type_ids, + query_position_ids, + query_attention_mask) + + title_cls_embedding = self.get_pooled_embedding(title_input_ids, + title_token_type_ids, + title_position_ids, + title_attention_mask) + + cosine_sim = paddle.matmul(query_cls_embedding, + title_cls_embedding, + transpose_y=True) + + # substract margin from all positive samples cosine_sim() + margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]], + fill_value=self.margin, + dtype=paddle.get_default_dtype()) + + cosine_sim = cosine_sim - paddle.diag(margin_diag) + + # scale cosine to ease training converge + cosine_sim *= self.sacle + + labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64') + labels = paddle.reshape(labels, 
shape=[-1, 1]) + + loss = F.cross_entropy(input=cosine_sim, label=labels) + + return loss diff --git a/pipelines/pipelines/nodes/retriever/dense.py b/pipelines/pipelines/nodes/retriever/dense.py index 728e8d400490..2baeac9ee0a0 100644 --- a/pipelines/pipelines/nodes/retriever/dense.py +++ b/pipelines/pipelines/nodes/retriever/dense.py @@ -14,6 +14,7 @@ # limitations under the License. from typing import List, Dict, Union, Optional +import os import logging import numpy as np @@ -22,11 +23,12 @@ import paddle from paddlenlp.data import Stack, Tuple, Pad -from paddlenlp.transformers import ErnieDualEncoder, AutoTokenizer +from paddlenlp.transformers import ErnieDualEncoder, AutoTokenizer, AutoModel from pipelines.schema import Document from pipelines.document_stores import BaseDocumentStore from pipelines.nodes.retriever.base import BaseRetriever +from pipelines.nodes.models import SemanticIndexBatchNeg from pipelines.data_handler.processor import TextSimilarityProcessor from pipelines.utils.common_utils import initialize_device_settings @@ -45,7 +47,9 @@ def __init__( Path, str] = "rocketqa-zh-dureader-query-encoder", passage_embedding_model: Union[ Path, str] = "rocketqa-zh-dureader-para-encoder", + params_path: Optional[str] = None, model_version: Optional[str] = None, + output_emb_size=256, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, @@ -133,12 +137,25 @@ def __init__( "This can be set when initializing the DocumentStore") # Init & Load Encoders - self.ernie_dual_encoder = ErnieDualEncoder(query_embedding_model, - passage_embedding_model) - self.query_tokenizer = AutoTokenizer.from_pretrained( - query_embedding_model) - self.passage_tokenizer = AutoTokenizer.from_pretrained( - passage_embedding_model) + if (os.path.exists(params_path)): + pretrained_model = AutoModel.from_pretrained(query_embedding_model) + self.ernie_dual_encoder = SemanticIndexBatchNeg( + pretrained_model, output_emb_size=output_emb_size) + # Load Custom models + print("Loading Parameters from:{}".format(params_path)) + state_dict = paddle.load(params_path) + self.ernie_dual_encoder.set_dict(state_dict) + self.query_tokenizer = AutoTokenizer.from_pretrained( + query_embedding_model) + self.passage_tokenizer = AutoTokenizer.from_pretrained( + query_embedding_model) + else: + self.ernie_dual_encoder = ErnieDualEncoder(query_embedding_model, + passage_embedding_model) + self.query_tokenizer = AutoTokenizer.from_pretrained( + query_embedding_model) + self.passage_tokenizer = AutoTokenizer.from_pretrained( + passage_embedding_model) self.processor = TextSimilarityProcessor( query_tokenizer=self.query_tokenizer, diff --git a/pipelines/rest_api/pipeline/semantic_search.yaml b/pipelines/rest_api/pipeline/semantic_search.yaml index 855e4811ef3f..3367384da48c 100644 --- a/pipelines/rest_api/pipeline/semantic_search.yaml +++ b/pipelines/rest_api/pipeline/semantic_search.yaml @@ -6,7 +6,7 @@ components: # define all the building-blocks for Pipeline params: host: localhost port: 9200 - index: dureader_robust_query_encoder + index: dureader_robust_nano_encoder embedding_dim: 312 - name: Retriever type: DensePassageRetriever diff --git a/pipelines/rest_api/pipeline/semantic_search_custom.yaml b/pipelines/rest_api/pipeline/semantic_search_custom.yaml new file mode 100644 index 000000000000..b0aab6ac5a99 --- /dev/null +++ b/pipelines/rest_api/pipeline/semantic_search_custom.yaml @@ -0,0 +1,67 @@ +version: '1.1.0' + +components: # define all the building-blocks for Pipeline + - name: DocumentStore + 
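+    # This pipeline loads fine-tuned Neural Search weights via params_path;
+    # embedding_dim here and output_emb_size on the Retriever must both
+    # match the checkpoint's projection size (256).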
type: ElasticsearchDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents + params: + host: localhost + port: 9200 + index: dureader_robust_neural_search + embedding_dim: 256 + - name: Retriever + type: DensePassageRetriever + params: + document_store: DocumentStore # params can reference other components defined in the YAML + top_k: 10 + query_embedding_model: rocketqa-zh-base-query-encoder + params_path: checkpoints/model_40/model_state.pdparams + output_emb_size: 256 + embed_title: False + - name: Ranker # custom-name for the component; helpful for visualization & debugging + type: ErnieRanker # pipelines Class name for the component + params: + model_name_or_path: rocketqa-nano-cross-encoder + top_k: 3 + - name: TextFileConverter + type: TextConverter + - name: ImageFileConverter + type: ImageToTextConverter + - name: PDFFileConverter + type: PDFToTextConverter + - name: DocxFileConverter + type: DocxToTextConverter + - name: Preprocessor + type: PreProcessor + params: + split_by: word + split_length: 1000 + - name: FileTypeClassifier + type: FileTypeClassifier + +pipelines: + - name: query # a sample extractive-qa Pipeline + type: Query + nodes: + - name: Retriever + inputs: [Query] + - name: Ranker + inputs: [Retriever] + - name: indexing + type: Indexing + nodes: + - name: FileTypeClassifier + inputs: [File] + - name: TextFileConverter + inputs: [FileTypeClassifier.output_1] + - name: PDFFileConverter + inputs: [FileTypeClassifier.output_2] + - name: DocxFileConverter + inputs: [FileTypeClassifier.output_4] + - name: ImageFileConverter + inputs: [FileTypeClassifier.output_6] + - name: Preprocessor + inputs: [PDFFileConverter, TextFileConverter, DocxFileConverter, ImageFileConverter] + - name: Retriever + inputs: [Preprocessor] + - name: DocumentStore + inputs: [Retriever] diff --git a/pipelines/utils/offline_ann.py b/pipelines/utils/offline_ann.py index 2241f585fc0d..373f57d5eda2 100644 --- a/pipelines/utils/offline_ann.py +++ b/pipelines/utils/offline_ann.py @@ -1,4 +1,5 @@ import argparse +import os import paddle from pipelines.utils import convert_files_to_dicts, fetch_archive_from_http @@ -51,6 +52,11 @@ type=str, help="The passage_embedding_model path") +parser.add_argument("--params_path", + default="checkpoints/model_40/model_state.pdparams", + type=str, + help="The checkpoint path") + parser.add_argument( '--delete_index', action='store_true', @@ -82,16 +88,30 @@ def offline_ann(index_name, doc_dir): document_store.write_documents(dicts) ### 语义索引模型 - retriever = DensePassageRetriever( - document_store=document_store, - query_embedding_model=args.query_embedding_model, - passage_embedding_model=args.passage_embedding_model, - max_seq_len_query=64, - max_seq_len_passage=256, - batch_size=16, - use_gpu=True, - embed_title=False, - ) + if (os.path.exists(args.params_path)): + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model=args.query_embedding_model, + params_path=args.params_path, + output_emb_size=args.embedding_dim, + max_seq_len_query=64, + max_seq_len_passage=256, + batch_size=16, + use_gpu=True, + embed_title=False, + ) + + else: + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model=args.query_embedding_model, + passage_embedding_model=args.passage_embedding_model, + max_seq_len_query=64, + max_seq_len_passage=256, + batch_size=16, + use_gpu=True, + embed_title=False, + ) # 建立索引库 document_store.update_embeddings(retriever) From 
3c8ad6bf9508fddc4286a9e33d5b071701d0be73 Mon Sep 17 00:00:00 2001 From: w5688414 Date: Thu, 1 Sep 2022 03:14:18 +0000 Subject: [PATCH 2/3] Adjust the format --- .../neural_search/recall/in_batch_negative/recall.py | 2 +- applications/neural_search/recall/simcse/README.md | 8 ++++++++ pipelines/examples/semantic-search/run_search_web.sh | 1 - pipelines/rest_api/pipeline/semantic_search.yaml | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/applications/neural_search/recall/in_batch_negative/recall.py b/applications/neural_search/recall/in_batch_negative/recall.py index f7c73aabec14..c0f3c64398bd 100644 --- a/applications/neural_search/recall/in_batch_negative/recall.py +++ b/applications/neural_search/recall/in_batch_negative/recall.py @@ -55,7 +55,7 @@ type=int, help="output_embedding_size") parser.add_argument("--recall_num", default=10, type=int, help="Recall number for each query from Ann index.") -parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="Select model to train, defaults to rocketqa-zh-base-query-encoder.") +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training") parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.") parser.add_argument("--hnsw_ef", default=100, type=int, diff --git a/applications/neural_search/recall/simcse/README.md b/applications/neural_search/recall/simcse/README.md index e5be579103fa..033afd18008f 100644 --- a/applications/neural_search/recall/simcse/README.md +++ b/applications/neural_search/recall/simcse/README.md @@ -151,6 +151,14 @@ simcse/ ## 5. 模型训练 +**语义索引预训练模型下载链接:** + +以下模型结构参数为: `TrasformerLayer:12, Hidden:768, Heads:12, OutputEmbSize: 256` + +|Model|训练参数配置|硬件|MD5| +| ------------ | ------------ | ------------ |-----------| +|[SimCSE](https://bj.bcebos.com/v1/paddlenlp/models/simcse_model.zip)|
ernie 1.0 epoch:3 lr:5E-5 bs:64 max_len:64|4卡 v100-16g
|7c46d9b15a214292e3897c0eb70d0c9f| + ### 训练环境说明 + NVIDIA Driver Version: 440.64.00 diff --git a/pipelines/examples/semantic-search/run_search_web.sh b/pipelines/examples/semantic-search/run_search_web.sh index a1273daf018d..05530d8779eb 100644 --- a/pipelines/examples/semantic-search/run_search_web.sh +++ b/pipelines/examples/semantic-search/run_search_web.sh @@ -1,5 +1,4 @@ unset http_proxy && unset https_proxy -export PYTHONPATH=/wugaosheng/workplace/PaddleNLP/pipelines:$PYTHONPATH # 配置模型服务地址 export API_ENDPOINT=http://127.0.0.1:8891 # 在指定端口 8502 启动 WebUI diff --git a/pipelines/rest_api/pipeline/semantic_search.yaml b/pipelines/rest_api/pipeline/semantic_search.yaml index 3367384da48c..855e4811ef3f 100644 --- a/pipelines/rest_api/pipeline/semantic_search.yaml +++ b/pipelines/rest_api/pipeline/semantic_search.yaml @@ -6,7 +6,7 @@ components: # define all the building-blocks for Pipeline params: host: localhost port: 9200 - index: dureader_robust_nano_encoder + index: dureader_robust_query_encoder embedding_dim: 312 - name: Retriever type: DensePassageRetriever From 73a970e2ae6e1eaa0349a18bbf75886c7509e335 Mon Sep 17 00:00:00 2001 From: w5688414 Date: Sat, 3 Sep 2022 09:30:12 +0000 Subject: [PATCH 3/3] Update Neural Search Recall and Upgrade docx for Pipelines --- .../recall/in_batch_negative/predict.py | 14 +- .../neural_search/recall/milvus/README.md | 1 + .../recall/milvus/feature_extract.py | 3 +- .../neural_search/recall/milvus/inference.py | 5 +- .../recall/milvus/scripts/feature_extract.sh | 1 + .../Install_windows.md | 100 +++++++++++ .../examples/semantic-search/Neural_Search.md | 163 ++++++++++++++++++ .../pipelines/nodes/file_converter/docx.py | 61 ++++--- pipelines/pipelines/nodes/retriever/dense.py | 2 +- 9 files changed, 317 insertions(+), 33 deletions(-) create mode 100644 pipelines/examples/frequently-asked-question/Install_windows.md create mode 100644 pipelines/examples/semantic-search/Neural_Search.md diff --git a/applications/neural_search/recall/in_batch_negative/predict.py b/applications/neural_search/recall/in_batch_negative/predict.py index 9e30ed94e71b..0337b49aa1bd 100644 --- a/applications/neural_search/recall/in_batch_negative/predict.py +++ b/applications/neural_search/recall/in_batch_negative/predict.py @@ -37,7 +37,7 @@ help="The path to model parameters to be loaded.") parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. 
" "Sequences longer than this will be truncated, sequences shorter will be padded.") -parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="Select model to train, defaults to rocketqa-zh-base-query-encoder.") +parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training") parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--output_emb_size", default=None, @@ -92,10 +92,14 @@ def predict(model, data_loader): pad_to_max_seq_len=args.pad_to_max_seq_len) batchify_fn = lambda samples, fn=Tuple( - Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_input - Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # query_segment - Pad(axis=0, pad_val=tokenizer.pad_token_id), # title_input - Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # tilte_segment + Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64' + ), # query_input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64' + ), # query_segment + Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64' + ), # title_input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64' + ), # tilte_segment ): [data for data in fn(samples)] valid_ds = load_dataset(read_text_pair, diff --git a/applications/neural_search/recall/milvus/README.md b/applications/neural_search/recall/milvus/README.md index e0b40a99636b..de3f1666b960 100644 --- a/applications/neural_search/recall/milvus/README.md +++ b/applications/neural_search/recall/milvus/README.md @@ -104,6 +104,7 @@ Milvus 搭建完系统以后就可以插入和检索向量了,首先生成 emb ``` CUDA_VISIBLE_DEVICES=0 python feature_extract.py \ --model_dir=./output \ + --model_name_or_path rocketqa-zh-base-query-encoder \ --corpus_file "data/milvus_data.csv" ``` 其中 output 目录下存放的是召回的 Paddle Inference 静态图模型。 diff --git a/applications/neural_search/recall/milvus/feature_extract.py b/applications/neural_search/recall/milvus/feature_extract.py index d50e8ca25b57..bea355b3af0c 100644 --- a/applications/neural_search/recall/milvus/feature_extract.py +++ b/applications/neural_search/recall/milvus/feature_extract.py @@ -50,6 +50,7 @@ help='Number of threads to predict when using cpu.') parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.') +parser.add_argument("--model_name_or_path",default='rocketqa-zh-base-query-encoder',type=str,help='The pretrained model used for training') args = parser.parse_args() # yapf: enable @@ -173,7 +174,7 @@ def read_text(file_path): args.batch_size, args.use_tensorrt, args.precision, args.cpu_threads, args.enable_mkldnn) - tokenizer = AutoTokenizer.from_pretrained('ernie-1.0') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) id2corpus = read_text(args.corpus_file) corpus_list = [{idx: text} for idx, text in id2corpus.items()] diff --git a/applications/neural_search/recall/milvus/inference.py b/applications/neural_search/recall/milvus/inference.py index 1ce0737982a3..7966ed696bf8 100644 --- a/applications/neural_search/recall/milvus/inference.py +++ b/applications/neural_search/recall/milvus/inference.py @@ -54,8 +54,9 @@ def search_in_milvus(text_embedding): batch_size = 1 params_path = 'checkpoints/model_40/model_state.pdparams' id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'} + model_name_or_path = "rocketqa-zh-base-query-encoder" paddle.set_device(device) - tokenizer = 
AutoTokenizer.from_pretrained('ernie-1.0')
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
     trans_func = partial(convert_example,
                          tokenizer=tokenizer,
                          max_seq_length=max_seq_length)
@@ -65,7 +66,7 @@ def search_in_milvus(text_embedding):
         Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"
             ),  # text_segment
     ): [data for data in fn(samples)]
-    pretrained_model = AutoModel.from_pretrained("ernie-1.0")
+    pretrained_model = AutoModel.from_pretrained(model_name_or_path)
     model = SemanticIndexBaseStatic(pretrained_model,
                                     output_emb_size=output_emb_size)
     # Load pretrained semantic model
diff --git a/applications/neural_search/recall/milvus/scripts/feature_extract.sh b/applications/neural_search/recall/milvus/scripts/feature_extract.sh
index eb1cf52f0dfe..7f996ac0600a 100644
--- a/applications/neural_search/recall/milvus/scripts/feature_extract.sh
+++ b/applications/neural_search/recall/milvus/scripts/feature_extract.sh
@@ -1,5 +1,6 @@
 CUDA_VISIBLE_DEVICES=2 python feature_extract.py \
         --model_dir ./output \
+        --model_name_or_path rocketqa-zh-base-query-encoder \
         --batch_size 512 \
         --corpus_file "milvus/milvus_data.csv"
diff --git a/pipelines/examples/frequently-asked-question/Install_windows.md b/pipelines/examples/frequently-asked-question/Install_windows.md
new file mode 100644
index 000000000000..30236378799f
--- /dev/null
+++ b/pipelines/examples/frequently-asked-question/Install_windows.md
@@ -0,0 +1,100 @@
+# WINDOWS环境下搭建端到端FAQ智能问答系统
+以下流程均基于 Anaconda 环境搭建。Anaconda 安装好以后,进入 `Anaconda Powershell Prompt`(由于环境变量设置不兼容的原因,暂不支持使用`cmd`执行下面的命令),然后执行下面的流程。
+
+## 1. 快速开始: 快速搭建FAQ智能问答系统
+
+### 1.1 运行环境和安装说明
+
+a. 依赖安装:
+首先从源码安装 htbuilder,然后安装 pipelines 及相关依赖,执行以下命令:
+```bash
+
+git clone https://github.com/tvst/htbuilder.git
+cd htbuilder/
+python setup.py install
+# 1) 安装 pipelines package
+cd ${HOME}/PaddleNLP/pipelines/
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+python setup.py install
+```
+【注意】以下所有流程都只需要在`pipelines`根目录下进行,不需要跳转目录。
+### 1.2 数据说明
+我们预置了基于[ 8000 多条保险行业问答数据](https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/baoxianzhidao/intro.ipynb)搭建保险FAQ智能问答的代码示例,您可以通过如下命令快速体验智能问答的效果
+
+### 1.3 一键体验FAQ智能问答系统
+
+```bash
+# 我们建议在 GPU 环境下运行本示例,运行速度较快
+# 设置 1 个空闲的 GPU 卡,此处假设 0 卡为空闲 GPU
+$env:CUDA_VISIBLE_DEVICES=0
+python examples/frequently-asked-question/dense_faq_example.py --device gpu
+# 如果只有 CPU 机器,可以通过 --device 参数指定 cpu 即可,运行耗时较长
+Remove-Item Env:CUDA_VISIBLE_DEVICES
+python examples/frequently-asked-question/dense_faq_example.py --device cpu
+```
+
+### 1.4 构建 Web 可视化FAQ系统
+
+整个 Web 可视化FAQ智能问答系统主要包含 3 大组件: 1. 基于 ElasticSearch 的 ANN 服务 2. 基于 RestAPI 构建模型服务 3. 基于 Streamlit 构建 WebUI,接下来我们依次搭建这 3 个服务并最终形成可视化的FAQ智能问答系统。
+
+#### 1.4.1 启动 ANN 服务
+1. 参考官方文档下载安装 [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
+2. 启动 ES 服务
+修改 `config/elasticsearch.yml`,把`xpack.security.enabled` 设置成 false,如下:
+```
+xpack.security.enabled: false
+```
+
+然后直接双击bin目录下的elasticsearch.bat即可启动。
+
+3. 
elasticsearch可视化工具Kibana(可选)
+为了更好地对数据进行管理,可以使用Kibana可视化工具进行管理和分析,下载链接为[Kibana](https://www.elastic.co/cn/downloads/kibana),下载完后解压,直接双击运行 `bin\kibana.bat`即可。
+
+#### 1.4.2 文档数据写入 ANN 索引库
+```
+# 以保险行业问答数据集为例建立 ANN 索引库
+python utils/offline_ann.py --index_name insurance --doc_dir data/insurance --split_answers --delete_index
+```
+参数含义说明
+* `index_name`: 索引的名称
+* `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
+* `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
+
+
+运行结束后,可使用Kibana查看数据。
+
+#### 1.4.3 启动 RestAPI 模型服务
+```bash
+# 指定FAQ智能问答系统的Yaml配置文件
+$env:PIPELINE_YAML_PATH='rest_api/pipeline/dense_faq.yaml'
+# 使用端口号 8891 启动模型服务
+python rest_api/application.py 8891
+```
+
+#### 1.4.4 启动 WebUI
+```bash
+# 配置模型服务地址
+$env:API_ENDPOINT='http://127.0.0.1:8891'
+# 在指定端口 8502 启动 WebUI
+python -m streamlit run ui/webapp_faq.py --server.port 8502
+```
+
+到这里您就可以打开浏览器访问 http://127.0.0.1:8502 地址体验FAQ智能问答系统服务了。
+
+#### 1.4.5 数据更新
+
+数据更新的方法有两种:第一种是使用前面的 `utils/offline_ann.py` 进行更新;另一种是使用前端界面的文件上传进行更新,支持txt、pdf、image、word格式。以txt格式的文件为例,每段文本需要使用空行隔开,程序会根据空行进行分段并建立索引,示例数据如下(demo.txt):
+
+```
+兴证策略认为,最恐慌的时候已经过去,未来一个月市场迎来阶段性修复窗口。
+
+从海外市场表现看,
+对俄乌冲突的恐慌情绪已显著释放,
+海外权益市场也从单边下跌转入双向波动。
+
+长期,继续聚焦科技创新的五大方向。1)新能源(新能源汽车、光伏、风电、特高压等),2)新一代信息通信技术(人工智能、大数据、云计算、5G等),3)高端制造(智能数控机床、机器人、先进轨交装备等),4)生物医药(创新药、CXO、医疗器械和诊断设备等),5)军工(导弹设备、军工电子元器件、空间站、航天飞机等)。
+```
+
+如果安装遇到问题,可以查看[FAQ文档](../../FAQ.md)
diff --git a/pipelines/examples/semantic-search/Neural_Search.md b/pipelines/examples/semantic-search/Neural_Search.md
new file mode 100644
index 000000000000..ac68a0c47cca
--- /dev/null
+++ b/pipelines/examples/semantic-search/Neural_Search.md
@@ -0,0 +1,163 @@
+# Neural Search
+
+## 1. 快速开始: 快速搭建语义检索系统
+
+
+### 1.1 运行环境和安装说明
+
+本实验在以下运行环境中完成,详细配置如下,用户也可以在自己的 GPU 硬件环境中运行:
+
+a. 软件环境:
+- python >= 3.7.0
+- paddlenlp >= 2.2.1
+- paddlepaddle-gpu >=2.3
+- CUDA Version: 10.2
+- NVIDIA Driver Version: 440.64.00
+- Ubuntu 16.04.6 LTS (Docker)
+
+b. 硬件环境:
+
+- NVIDIA Tesla V100 16GB x4卡
+- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+c. 依赖安装:
+首先需要安装 PaddlePaddle,安装方法请参考[官方安装文档](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html),然后安装下面的依赖:
+```bash
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+# 1) 安装 pipelines package
+cd ${HOME}/PaddleNLP/pipelines/
+python setup.py install
+```
+【注意】以下所有流程都只需要在`pipelines`根目录下进行,不需要跳转目录。
+
+### 1.2 数据说明
+语义检索数据库的数据来自于[DuReader-Robust数据集](https://github.com/baidu/DuReader/tree/master/DuReader-Robust),共包含 46972 个段落文本,并选取了其中验证集1417条段落文本来搭建语义检索系统。
+
+### 1.3 一键体验语义检索系统
+
+#### 1.3.1 快速一键启动
+
+我们预置了基于[DuReader-Robust数据集](https://github.com/baidu/DuReader/tree/master/DuReader-Robust)搭建语义检索系统的代码示例,您可以通过如下命令快速体验语义检索系统的效果
+```bash
+# 我们建议在 GPU 环境下运行本示例,运行速度较快
+# 设置 1 个空闲的 GPU 卡,此处假设 0 卡为空闲 GPU
+export CUDA_VISIBLE_DEVICES=0
+python examples/semantic-search/semantic_search_example.py \
+    --device gpu \
+    --query_embedding_model rocketqa-zh-base-query-encoder \
+    --params_path checkpoints/model_40/model_state.pdparams \
+    --embedding_dim 256
+# 如果只有 CPU 机器,可以通过 --device 参数指定 cpu 即可,运行耗时较长
+unset CUDA_VISIBLE_DEVICES
+python examples/semantic-search/semantic_search_example.py \
+    --device cpu \
+    --query_embedding_model rocketqa-zh-base-query-encoder \
+    --params_path checkpoints/model_40/model_state.pdparams \
+    --embedding_dim 256
+```
+
+### 1.4 构建 Web 可视化语义检索系统
+
+整个 Web 可视化语义检索系统主要包含 3 大组件: 1. 基于 ElasticSearch 的 ANN 服务 2. 基于 RestAPI 构建模型服务 3. 基于 Streamlit 构建 WebUI,接下来我们依次搭建这 3 个服务并最终形成可视化的语义检索系统。
+
+#### 1.4.1 启动 ANN 服务
+1. 参考官方文档下载安装 [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
+2. 启动 ES 服务
+首先修改`config/elasticsearch.yml`的配置:
+```
+xpack.security.enabled: false
+```
+然后启动:
+```bash
+./bin/elasticsearch
+```
+3. 检查确保 ES 服务启动成功
+```bash
+curl http://localhost:9200/_aliases?pretty=true
+```
+备注:ES 服务默认开启端口为 9200
+
+#### 1.4.2 文档数据写入 ANN 索引库
+```
+# 以DuReader-Robust 数据集为例建立 ANN 索引库
+python utils/offline_ann.py --index_name dureader_robust_neural_search \
+                            --doc_dir data/dureader_dev \
+                            --query_embedding_model rocketqa-zh-base-query-encoder \
+                            --params_path checkpoints/model_40/model_state.pdparams \
+                            --embedding_dim 256 \
+                            --delete_index
+```
+可以使用下面的命令来查看数据:
+
+```
+# 打印几条数据
+curl http://localhost:9200/dureader_robust_neural_search/_search
+```
+
+参数含义说明
+* `index_name`: 索引的名称
+* `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
+* `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
+
+#### 1.4.3 启动 RestAPI 模型服务
+```bash
+# 指定语义检索系统的Yaml配置文件
+export PIPELINE_YAML_PATH=rest_api/pipeline/semantic_search_custom.yaml
+# 使用端口号 8891 启动模型服务
+python rest_api/application.py 8891
+```
+Linux 用户推荐采用 Shell 脚本来启动服务:
+
+```bash
+sh examples/semantic-search/run_neural_search_server.sh
+```
+启动后可以使用curl命令验证是否成功运行:
+
+```
+curl -X POST -k http://localhost:8891/query -H 'Content-Type: application/json' -d '{"query": "衡量酒水的价格的因素有哪些?","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}'
+
+```
+#### 1.4.4 启动 WebUI
+```bash
+# 配置模型服务地址
+export API_ENDPOINT=http://127.0.0.1:8891
+# 在指定端口 8502 启动 WebUI
+python -m streamlit run ui/webapp_semantic_search.py --server.port 8502
+```
+Linux 用户推荐采用 Shell 脚本来启动服务:
+
+```bash
+sh examples/semantic-search/run_search_web.sh
+```
+
+到这里您就可以打开浏览器访问 http://127.0.0.1:8502 地址体验语义检索系统服务了。
+
+#### 1.4.5 数据更新
+
+数据更新的方法有两种:第一种是使用前面的 `utils/offline_ann.py` 进行更新;另一种是使用前端界面的文件上传进行更新,支持txt、pdf、image、word格式。以txt格式的文件为例,每段文本需要使用空行隔开,程序会根据空行进行分段并建立索引,示例数据如下(demo.txt):
+
+```
+兴证策略认为,最恐慌的时候已经过去,未来一个月市场迎来阶段性修复窗口。
+
+从海外市场表现看,
+对俄乌冲突的恐慌情绪已显著释放,
+海外权益市场也从单边下跌转入双向波动。
+
+长期,继续聚焦科技创新的五大方向。1)新能源(新能源汽车、光伏、风电、特高压等),2)新一代信息通信技术(人工智能、大数据、云计算、5G等),3)高端制造(智能数控机床、机器人、先进轨交装备等),4)生物医药(创新药、CXO、医疗器械和诊断设备等),5)军工(导弹设备、军工电子元器件、空间站、航天飞机等)。
+```
+如果安装遇到问题,可以查看[FAQ文档](../../FAQ.md)
+
+## Reference
+[1]Y. Sun et al., “[ERNIE 3.0: Large-scale Knowledge Enhanced Pre-training for Language Understanding and Generation](https://arxiv.org/pdf/2107.02137.pdf),” arXiv:2107.02137 [cs], Jul. 2021, Accessed: Jan. 17, 2022. [Online]. Available: http://arxiv.org/abs/2107.02137
+
+[2]Y. Qu et al., “[RocketQA: An Optimized Training Approach to Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2010.08191),” arXiv:2010.08191 [cs], May 2021, Accessed: Aug. 16, 2021. [Online]. Available: http://arxiv.org/abs/2010.08191
+
+[3]H. Tang, H. Li, J. Liu, Y. Hong, H. Wu, and H. Wang, “[DuReader_robust: A Chinese Dataset Towards Evaluating Robustness and Generalization of Machine Reading Comprehension in Real-World Applications](https://arxiv.org/pdf/2004.11142.pdf).” arXiv, Jul. 21, 2021. Accessed: May 15, 2022. [Online]. 
Available: http://arxiv.org/abs/2004.11142
+
+## Acknowledge
+
+我们借鉴了 Deepset.ai [Haystack](https://github.com/deepset-ai/haystack) 优秀的框架设计,在此对[Haystack](https://github.com/deepset-ai/haystack)作者及其开源社区表示感谢。
+
+We learned from the excellent framework design of Deepset.ai [Haystack](https://github.com/deepset-ai/haystack), and we would like to express our thanks to the authors of Haystack and their open source community.
diff --git a/pipelines/pipelines/nodes/file_converter/docx.py b/pipelines/pipelines/nodes/file_converter/docx.py
index e2adc5a54a4e..3d036a3ada39 100644
--- a/pipelines/pipelines/nodes/file_converter/docx.py
+++ b/pipelines/pipelines/nodes/file_converter/docx.py
@@ -52,7 +52,7 @@ def __init__(
             in garbled text.
         """
 
-        # save init parameters to enable export of component config as YAML
+        # Save init parameters to enable export of component config as YAML
         self.set_config(remove_numeric_tables=remove_numeric_tables,
                         valid_languages=valid_languages)
 
@@ -109,38 +109,51 @@ def convert(
         for i in range(len(file.paragraphs)):
             paragraph = file.paragraphs[i]
             # Extracting text from the paragraph
-            if (paragraph.text):
-                if bool(text_dict):
-                    # The texts and corresponding images will be added into documents
+            # If there is text, add it to text_dict
+            if (paragraph.text != ""):
+                text = paragraph.text
+                if not text_dict:
+                    text_dict = {'text': [text], 'images': []}
+                else:
+                    text_dict['text'].append(text)
+            # Extracting images from the paragraph
+            else:
+                image_list = self.get_image_list(file, paragraph)
+                # If there is neither text nor an image, flush the accumulated text_dict into documents
+                if (image_list is None and bool(text_dict)):
+                    raw_text = ''.join(text_dict['text'])
+                    # Skip if the accumulated text is empty
+                    if (raw_text == ''):
+                        continue
                     meta_data = {}
                     meta_data['name'] = meta['name']
                     meta_data['images'] = text_dict['images']
                     document = {
-                        "content": text_dict['text'],
+                        "content": raw_text,
                         "content_type": "text",
                         "meta": meta_data
                     }
                     documents.append(document)
-                # Storing new paragraph text into a new dict
-                text = paragraph.text
-                text_dict = {'text': text, 'images': []}
-            # Extracting images from the paragraph
-            else:
-                image_list = self.get_image_list(file, paragraph)
-                if (image_list is None):
+
+                    text = paragraph.text
+                    text_dict = {'text': [text], 'images': []}
+                # If there are images, add them to text_dict
+                elif (image_list is not None):
+                    for i, image in enumerate(image_list):
+                        if image:
+                            # File extension & file content
+                            ext, blob = image.ext, image.blob
+                            # Use md5 to generate the image name and save the image into desc_path
+                            md5hash = hashlib.md5(blob)
+                            md5_name = md5hash.hexdigest()
+                            image_name = '{}_{}.{}'.format(md5_name, i, ext)
+                            image_path = os.path.join(self.desc_path,
+                                                      image_name)
+                            Image.open(BytesIO(blob)).save(image_path)
+                            # Add image_name into text_dict as the image for this text
+                            text_dict['images'].append(image_name)
+                        else:
                             continue
-                for i, image in enumerate(image_list):
-                    if image:
-                        # File extension & file content
-                        ext, blob = image.ext, image.blob
-                        # Using md5 to generate image name and save image into desc_path
-                        md5hash = hashlib.md5(blob)
-                        md5_name = md5hash.hexdigest()
-                        image_name = '{}_{}.{}'.format(md5_name, i, ext)
-                        image_path = os.path.join(self.desc_path, image_name)
-                        Image.open(BytesIO(blob)).save(image_path)
-                        # Adding image_name into the text_dict as the image for the text
-                        text_dict['images'].append(image_name)
         return documents
 
     def get_image_list(self, document: Document, paragraph: Paragraph):
diff --git 
a/pipelines/pipelines/nodes/retriever/dense.py b/pipelines/pipelines/nodes/retriever/dense.py index 2baeac9ee0a0..6040938faf29 100644 --- a/pipelines/pipelines/nodes/retriever/dense.py +++ b/pipelines/pipelines/nodes/retriever/dense.py @@ -47,7 +47,7 @@ def __init__( Path, str] = "rocketqa-zh-dureader-query-encoder", passage_embedding_model: Union[ Path, str] = "rocketqa-zh-dureader-para-encoder", - params_path: Optional[str] = None, + params_path: Optional[str] = "", model_version: Optional[str] = None, output_emb_size=256, max_seq_len_query: int = 64,
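
---

Below is a minimal usage sketch, not part of the patch itself, showing how the new `params_path` branch of `DensePassageRetriever` is meant to be exercised from Python. The checkpoint path and index name are illustrative assumptions taken from the examples above, and a local Elasticsearch instance on port 9200 is assumed to be running.

```python
# A minimal sketch: load a fine-tuned Neural Search checkpoint into
# DensePassageRetriever via the new params_path argument.
# The checkpoint path and index name below are assumptions for illustration.
from pipelines.document_stores import ElasticsearchDocumentStore
from pipelines.nodes import DensePassageRetriever

document_store = ElasticsearchDocumentStore(host="localhost",
                                            port=9200,
                                            index="dureader_robust_neural_search",
                                            embedding_dim=256)

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="rocketqa-zh-base-query-encoder",
    # When params_path points to an existing file, the retriever wraps the
    # pretrained model in SemanticIndexBatchNeg and loads these parameters;
    # otherwise it falls back to the two-tower ErnieDualEncoder branch.
    params_path="checkpoints/model_40/model_state.pdparams",
    output_emb_size=256,
    embed_title=False,
)

# Rebuild the ANN index with embeddings from the custom encoder.
document_store.update_embeddings(retriever)
```

Note the design choice visible in the diff: when a custom checkpoint is supplied, the query encoder and its tokenizer are reused for passages as well, because the fine-tuned Neural Search model is a single shared-parameter tower rather than the two-tower ErnieDualEncoder.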