
kunlunxin bertLarge inference configs && results #212

Merged 7 commits on Sep 14, 2023
20 changes: 19 additions & 1 deletion inference/benchmarks/bertLarge/README.md
@@ -40,6 +40,24 @@ bert_reference_results_text_md5.txt

- TensorRT 8.5.1.7

#### 2.2 Kunlunxin R200

- ##### Hardware environment
  - Machine and accelerator model: R200

- ##### Software environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.15.0-56-generic
  - Accelerator driver version: 4.0
  - Docker version: 20.10.21
  - Dependency versions:
    - pytorch: 1.13.0+cpu
    - onnx: 1.14.0

- Inference toolkit

  - XTCL 2.1

### 4. Results (BERT-Large)

* Metric list
@@ -64,5 +82,5 @@ bert_reference_results_text_md5.txt
| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
| tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 |
| tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 |
| kunlunxin_xtcl | W32A16 | 32 | 3867.6 | None | None | 93.8 | 124.9 | None | 0.638/0.638 | None |

@@ -0,0 +1,3 @@
compiler: xtcl
no_validation: true
exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx
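
The three-line vendor config above tells the harness to compile with XTCL, skip accuracy validation, and load a pre-exported ONNX graph rather than exporting one itself. A minimal sketch of how such a file could be produced — the HuggingFace checkpoint name, sequence length, and opset below are assumptions, since the PR does not show how `bertLarge_bs32_pytorch_fp16False.onnx` was built:

```python
# Hedged sketch: export an fp32 BERT-Large to the path the config expects.
# Checkpoint name, sequence length, and opset are assumptions, not the PR's.
import os
import torch
from transformers import BertForMaskedLM

os.makedirs("onnxs/bertLarge", exist_ok=True)
# return_dict=False makes the forward pass return tuples, which exports cleanly.
model = BertForMaskedLM.from_pretrained("bert-large-uncased", return_dict=False).eval()

bs, seq = 32, 512  # batch size matches the bs32 in the config's file name
input_ids = torch.zeros(bs, seq, dtype=torch.long)
attention_mask = torch.ones(bs, seq, dtype=torch.long)

torch.onnx.export(
    model,
    (input_ids, attention_mask),
    "onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    opset_version=13,
)
```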
40 changes: 27 additions & 13 deletions inference/inference_engine/kunlunxin/xtcl.py
@@ -3,16 +3,19 @@
 import tvm.relay as relay
 from tvm.contrib.download import download_testdata
 from tvm.relay import param_dict
-from tvm.contrib import xpu_config
+from tvm.contrib import graph_executor, xpu_config
+from tvm.runtime.vm import VirtualMachine
 import torch
 import os
 import subprocess
 from loguru import logger
 import numpy as np
 import time

+USE_VM_COMPILE = False
+
 class InferModel:

     def __init__(self, config, onnx_path, model):
         self.input_names = []
         self.engine = self.build_engine(config, onnx_path)
@@ -27,7 +30,7 @@ def build_engine(self, config, onnx_path):
             input_name = input.name  #'inputs:0'
             self.input_names.append(input_name)
             shape_dict[input_name] = input_shape

         mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

         target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
@@ -44,21 +47,32 @@ def build_engine(self, config, onnx_path):
                 config_var_dtype_map=input_fp16,
             ).value()
         else:  ## fp32
-            os.environ['XTCL_USE_FP16'] = '0'
-            os.environ['XTCL_QUANTIZE_WEIGHT'] = '0'
+            os.environ['XTCL_USE_FP16'] = '1'
+            os.environ['XTCL_QUANTIZE_WEIGHT'] = '1'
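+            # Note: despite the "## fp32" comment, this branch now turns on
+            # XTCL fp16 and weight quantization by default — a behavior
+            # change carried by the two lines above.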

         with tvm.transform.PassContext(opt_level=3, config=build_config):
-            vm_exec = relay.backend.vm.compile(mod,
-                                               target=target_host,
-                                               target_host=target_host,
-                                               params=params)
-            from tvm.runtime.vm import VirtualMachine
-            vm = VirtualMachine(vm_exec, ctx)
-            return vm
+            if USE_VM_COMPILE:
+                vm_exec = relay.backend.vm.compile(mod,
+                                                   target=target_host,
+                                                   target_host=target_host,
+                                                   params=params)
+
+                vm = VirtualMachine(vm_exec, ctx)
+                return vm
+            else:
+                graph, lib, params = relay.build(
+                    mod,
+                    target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2",
+                    params=params)
+                m = graph_executor.create(graph, lib, ctx)
+                m.set_input(**params)
+                return m

     def __call__(self, model_inputs: list):
         for index, input_name in enumerate(self.input_names):
-            self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index]))
+            if USE_VM_COMPILE:
+                self.engine.set_one_input("main", input_name, tvm.nd.array(model_inputs[index]))
+            else:
+                self.engine.set_input(input_name, tvm.nd.array(model_inputs[index]))
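+            # VirtualMachine.set_one_input takes the entry-function name
+            # ("main") before the tensor; graph_executor's set_input binds
+            # by input name alone, hence the two call shapes above.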
         self.engine.run()
         output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         foo_time_start = time.time()
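
For readers unfamiliar with the two execution paths toggled by `USE_VM_COMPILE`: below is a minimal, CPU-only sketch of the graph-executor flow the new default branch follows. The `llvm` target and toy ReLU module are stand-ins so the snippet runs on a stock TVM install; the PR itself compiles for the Kunlunxin `xpu` target under XTCL.

```python
# Minimal sketch of the graph-executor flow used by the new default branch,
# reduced to a CPU llvm target (the PR's target is
# "xpu -libs=xdnn -split-device-funcs -device-type=xpu2" on an XPU device).
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

# A toy Relay module standing in for the ONNX-imported BERT graph.
x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

ctx = tvm.cpu(0)  # the PR uses the XPU device context here
with tvm.transform.PassContext(opt_level=3):
    # Legacy tuple-unpacking form, as in the PR; newer TVM returns a
    # GraphExecutorFactoryModule that still unpacks this way.
    graph, lib, params = relay.build(mod, target="llvm")

m = graph_executor.create(graph, lib, ctx)
m.set_input(**params)  # bind compiled weights (empty for this toy module)
m.set_input("x", tvm.nd.array(np.array([[-1.0, 0.0, 2.0, 3.0]], dtype="float32")))
m.run()
print(m.get_output(0).numpy())  # -> [[0. 0. 2. 3.]]
```

The VM path (`USE_VM_COMPILE = True`) would instead go through `relay.backend.vm.compile` and `tvm.runtime.vm.VirtualMachine`, which is generally needed when a model contains dynamic shapes or control flow that the static graph executor cannot express.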