
kunlunxin bertLarge inference configs && results #212

Merged 7 commits on Sep 14, 2023
20 changes: 19 additions & 1 deletion inference/benchmarks/bertLarge/README.md
@@ -40,6 +40,24 @@ bert_reference_results_text_md5.txt

- TensorRT 8.5.1.7

#### 2.2 Kunlunxin R200

- ##### Hardware environment
  - Machine and accelerator model: R200

- ##### Software environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.15.0-56-generic
  - Accelerator driver version: 4.0
  - Docker version: 20.10.21
  - Dependency versions:
    - pytorch: 1.13.0+cpu
    - onnx: 1.14.0

- Inference toolkit

  - XTCL 2.1

### 4. Results (BERT-Large)

* Metric list
@@ -64,5 +82,5 @@ bert_reference_results_text_md5.txt
| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
| tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 |
| tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 |
| kunlunxin_xtcl | W32A16 | 32 | 3867.6 | None | None | 93.8 | 124.9 | None | 0.638/0.638 | None |

@@ -0,0 +1,3 @@
compiler: xtcl
no_validation: true
exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx
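
The three-line vendor config above tells the harness to compile with XTCL, skip accuracy validation, and load a pre-exported ONNX graph rather than exporting one itself. A minimal sketch of how such a file could be produced — the HuggingFace checkpoint name, sequence length, and opset below are assumptions, since the PR does not show how `bertLarge_bs32_pytorch_fp16False.onnx` was built:

```python
# Hedged sketch: export an fp32 BERT-Large to the path the config expects.
# Checkpoint name, sequence length, and opset are assumptions, not the PR's.
import os
import torch
from transformers import BertForMaskedLM

os.makedirs("onnxs/bertLarge", exist_ok=True)
# return_dict=False makes the forward pass return tuples, which exports cleanly.
model = BertForMaskedLM.from_pretrained("bert-large-uncased", return_dict=False).eval()

bs, seq = 32, 512  # batch size matches the bs32 in the config's file name
input_ids = torch.zeros(bs, seq, dtype=torch.long)
attention_mask = torch.ones(bs, seq, dtype=torch.long)

torch.onnx.export(
    model,
    (input_ids, attention_mask),
    "onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    opset_version=13,
)
```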
40 changes: 27 additions & 13 deletions inference/inference_engine/kunlunxin/xtcl.py
@@ -3,16 +3,19 @@
 import tvm.relay as relay
 from tvm.contrib.download import download_testdata
 from tvm.relay import param_dict
-from tvm.contrib import xpu_config
+from tvm.contrib import graph_executor, xpu_config
+from tvm.runtime.vm import VirtualMachine
 import torch
 import os
 import subprocess
 from loguru import logger
 import numpy as np
 import time

+USE_VM_COMPILE = False
+
 class InferModel:

     def __init__(self, config, onnx_path, model):
         self.input_names = []
         self.engine = self.build_engine(config, onnx_path)
@@ -27,7 +30,7 @@ def build_engine(self, config, onnx_path):
             input_name = input.name  #'inputs:0'
             self.input_names.append(input_name)
             shape_dict[input_name] = input_shape

         mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

         target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
@@ -44,21 +47,32 @@ def build_engine(self, config, onnx_path):
                 config_var_dtype_map=input_fp16,
             ).value()
         else:  ## fp32
-            os.environ['XTCL_USE_FP16'] = '0'
-            os.environ['XTCL_QUANTIZE_WEIGHT'] = '0'
+            os.environ['XTCL_USE_FP16'] = '1'
+            os.environ['XTCL_QUANTIZE_WEIGHT'] = '1'
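+            # Note: despite the "## fp32" comment, this branch now turns on
+            # XTCL fp16 and weight quantization by default — a behavior
+            # change carried by the two lines above.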

         with tvm.transform.PassContext(opt_level=3, config=build_config):
-            vm_exec = relay.backend.vm.compile(mod,
-                                               target=target_host,
-                                               target_host=target_host,
-                                               params=params)
-            from tvm.runtime.vm import VirtualMachine
-            vm = VirtualMachine(vm_exec, ctx)
-            return vm
+            if USE_VM_COMPILE:
+                vm_exec = relay.backend.vm.compile(mod,
+                                                   target=target_host,
+                                                   target_host=target_host,
+                                                   params=params)
+
+                vm = VirtualMachine(vm_exec, ctx)
+                return vm
+            else:
+                graph, lib, params = relay.build(
+                    mod,
+                    target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2",
+                    params=params)
+                m = graph_executor.create(graph, lib, ctx)
+                m.set_input(**params)
+                return m

     def __call__(self, model_inputs: list):
         for index, input_name in enumerate(self.input_names):
-            self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index]))
+            if USE_VM_COMPILE:
+                self.engine.set_one_input("main", input_name, tvm.nd.array(model_inputs[index]))
+            else:
+                self.engine.set_input(input_name, tvm.nd.array(model_inputs[index]))
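+            # VirtualMachine.set_one_input takes the entry-function name
+            # ("main") before the tensor; graph_executor's set_input binds
+            # by input name alone, hence the two call shapes above.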
         self.engine.run()
         output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         foo_time_start = time.time()
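
For readers unfamiliar with the two execution paths toggled by `USE_VM_COMPILE`: below is a minimal, CPU-only sketch of the graph-executor flow the new default branch follows. The `llvm` target and toy ReLU module are stand-ins so the snippet runs on a stock TVM install; the PR itself compiles for the Kunlunxin `xpu` target under XTCL.

```python
# Minimal sketch of the graph-executor flow used by the new default branch,
# reduced to a CPU llvm target (the PR's target is
# "xpu -libs=xdnn -split-device-funcs -device-type=xpu2" on an XPU device).
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

# A toy Relay module standing in for the ONNX-imported BERT graph.
x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

ctx = tvm.cpu(0)  # the PR uses the XPU device context here
with tvm.transform.PassContext(opt_level=3):
    # Legacy tuple-unpacking form, as in the PR; newer TVM returns a
    # GraphExecutorFactoryModule that still unpacks this way.
    graph, lib, params = relay.build(mod, target="llvm")

m = graph_executor.create(graph, lib, ctx)
m.set_input(**params)  # bind compiled weights (empty for this toy module)
m.set_input("x", tvm.nd.array(np.array([[-1.0, 0.0, 2.0, 3.0]], dtype="float32")))
m.run()
print(m.get_output(0).numpy())  # -> [[0. 0. 2. 3.]]
```

The VM path (`USE_VM_COMPILE = True`) would instead go through `relay.backend.vm.compile` and `tvm.runtime.vm.VirtualMachine`, which is generally needed when a model contains dynamic shapes or control flow that the static graph executor cannot express.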