diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md
index f84a474eb..349240525 100644
--- a/inference/benchmarks/bertLarge/README.md
+++ b/inference/benchmarks/bertLarge/README.md
@@ -40,6 +40,24 @@ bert_reference_results_text_md5.txt
 
 - TensorRT 8.5.1.7
 
+#### 2.2 Kunlunxin R200
+
+- ##### Hardware environment
+  - Machine and accelerator model: R200
+
+- ##### Software environment
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.15.0-56-generic
+  - Accelerator driver version: 4.0
+  - Docker version: 20.10.21
+  - Dependency versions:
+    - pytorch: 1.13.0+cpu
+    - onnx: 1.14.0
+
+- Inference toolkit
+
+  - XTCL 2.1
+
 ### 4. Results (BERT-Large)
 
 * Metric list
@@ -64,5 +82,5 @@ bert_reference_results_text_md5.txt
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 |
 | tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 |
-
+| kunlunxin_xtcl | W32A16 | 32 | 3867.6 | None | None | 93.8 | 124.9 | None | 0.638/0.638 | None |
 
diff --git a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..c29b9c46b
--- /dev/null
+++ b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,3 @@
+compiler: xtcl
+no_validation: true
+exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
index 396cc3ae9..2643f51d5 100755
--- a/inference/inference_engine/kunlunxin/xtcl.py
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -3,7 +3,8 @@ import tvm.relay as relay
 from tvm.contrib.download import download_testdata
 from tvm.relay import param_dict
-from tvm.contrib import xpu_config
+from tvm.contrib import graph_executor, xpu_config
+from tvm.runtime.vm import VirtualMachine
 import torch
 import os
 import subprocess
 
@@ -11,8 +12,10 @@ import numpy as np
 import time
 
+USE_VM_COMPILE = False
+
 class InferModel:
-    
+
     def __init__(self, config , onnx_path, model):
         self.input_names = []
         self.engine = self.build_engine(config, onnx_path)
 
@@ -27,7 +30,7 @@ def build_engine(self, config, onnx_path):
             input_name = input.name #'inputs:0'
             self.input_names.append(input_name)
             shape_dict[input_name] = input_shape
-        
+
         mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
 
         target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
@@ -44,21 +47,32 @@ def build_engine(self, config, onnx_path):
                 config_var_dtype_map=input_fp16,
             ).value()
         else: ## fp32
-            os.environ['XTCL_USE_FP16'] = '0'
-            os.environ['XTCL_QUANTIZE_WEIGHT'] = '0'
+            os.environ['XTCL_USE_FP16'] = '1'
+            os.environ['XTCL_QUANTIZE_WEIGHT'] = '1'
 
         with tvm.transform.PassContext(opt_level=3, config=build_config):
-            vm_exec = relay.backend.vm.compile(mod,
-                                               target=target_host,
-                                               target_host=target_host,
-                                               params=params)
-            from tvm.runtime.vm import VirtualMachine
-            vm = VirtualMachine(vm_exec, ctx)
-            return vm
+            if USE_VM_COMPILE:
+                vm_exec = relay.backend.vm.compile(mod,
+                                                   target=target_host,
+                                                   target_host=target_host,
+                                                   params=params)
+
+                vm = VirtualMachine(vm_exec, ctx)
+                return vm
+            else:
+                graph, lib, params = relay.build(mod,
+                                                 target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2",
+                                                 params=params)
+                m = graph_executor.create(graph, lib, ctx)
+                m.set_input(**params)
+                return m
 
     def __call__(self, model_inputs: list):
         for index, input_name in enumerate(self.input_names):
-            self.engine.set_one_input("main", input_name, tvm.nd.array(model_inputs[index]))
+            if USE_VM_COMPILE:
+                self.engine.set_one_input("main", input_name, tvm.nd.array(model_inputs[index]))
+            else:
+                self.engine.set_input(input_name, tvm.nd.array(model_inputs[index]))
        self.engine.run()
        output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
        foo_time_start = time.time()
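
For context on the `USE_VM_COMPILE` switch introduced above: the patch keeps the original Relay VM path but defaults to the graph executor. Below is a minimal, CPU-only sketch of the same two paths against stock TVM on a plain `llvm` target, not the XTCL fork, so the vendor-specific `xpu` target string, `set_one_input`, and the three-value `relay.build` return are left out. `build_and_run` and the toy ReLU module are illustrative names, not part of this patch.

```python
# Hypothetical sketch (not part of this patch): the two execution paths that
# USE_VM_COMPILE toggles, shown against stock TVM on a plain llvm (CPU) target.
import numpy as np
import tvm
import tvm.relay as relay
from tvm.contrib import graph_executor
from tvm.runtime.vm import VirtualMachine


def build_and_run(mod, params, inputs, use_vm=False, target="llvm"):
    dev = tvm.device(target, 0)
    if use_vm:
        # Relay VM: compile to a VM executable; supports dynamic shapes/control flow.
        vm_exec = relay.backend.vm.compile(mod, target=target, params=params)
        vm = VirtualMachine(vm_exec, dev)
        # Inputs passed positionally, in the order of the function parameters.
        return vm.run(*[tvm.nd.array(v) for v in inputs.values()])
    # Graph executor: build a static graph, set inputs by name, then run.
    lib = relay.build(mod, target=target, params=params)
    m = graph_executor.GraphModule(lib["default"](dev))
    for name, value in inputs.items():
        m.set_input(name, tvm.nd.array(value))
    m.run()
    return [m.get_output(i) for i in range(m.get_num_outputs())]


# Toy single-op Relay module standing in for the BERT-Large ONNX import.
x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))
data = {"x": np.random.randn(1, 4).astype("float32")}
print(build_and_run(mod, {}, data, use_vm=False)[0])
print(build_and_run(mod, {}, data, use_vm=True))
```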
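For a fixed-shape export like this bs32 BERT-Large ONNX model, the graph-executor path is typically the lower-overhead choice, which is presumably why `USE_VM_COMPILE` ships as `False` while the VM path is retained for cases that need dynamic shapes or control flow.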