diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md index 5998c0cf9..cd77ab738 100644 --- a/inference/benchmarks/vit_l_16/README.md +++ b/inference/benchmarks/vit_l_16/README.md @@ -83,4 +83,5 @@ find ./val -name "*JPEG" | wc -l | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 64 |1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 | | tensorrt | fp32 | 32 | 1275.9 | 482.4 | 491.1 | 555.5 | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 | +| kunlunxin_xtcl | W32A16 | 32 | 2118.307 | / | / | 130.006 | 144.914 | 27.9% | 79.3/79.3 | / | diff --git a/inference/configs/host.yaml b/inference/configs/host.yaml index f5ec9d0ac..25c7f796b 100644 --- a/inference/configs/host.yaml +++ b/inference/configs/host.yaml @@ -13,4 +13,4 @@ PIP_SOURCE: "https://mirror.baidu.com/pypi/simple" CLEAR_CACHES: True ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES" CASES: - "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val" \ No newline at end of file + "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val" diff --git a/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml b/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml new file mode 100644 index 000000000..bf71dd82c --- /dev/null +++ b/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml @@ -0,0 +1,5 @@ +compiler: xtcl +# skip validation(will also skip create_model, export onnx). 
def analysis_log(logpath):
    '''
    Summarise a kunlunxin monitor log.

    The monitor appends records whose data line has the form
    " xpu_smi <temp> <power> <mem> <w_mem> <use_rate>".  This function scans
    every such line and tracks:

    * the largest <mem> value seen, and
    * the most recent <w_mem> value (the last record wins, it is not a max).

    Args:
        logpath: path of the monitor log file to read.

    Returns:
        A 4-tuple (max_mem_used, w_mem, 32e12, 128e12).  The first two values
        are divided by 1024 and rounded to two decimals (presumably MiB ->
        GiB; confirm against the xpu_smi output units).  The last two are
        fixed constants whose meaning is defined by the caller -- presumably
        theoretical peak throughput figures, TODO confirm.
    '''
    max_usage = 0.0  ## usage_mem
    max_mem = 0.0
    # The context manager closes the log even if parsing raises.
    with open(logpath) as logfile:
        for line in logfile:
            # xpu_smi temp power mem w_mem use_rate
            fields = line.split()
            if "xpu_smi" not in fields:
                continue
            values = fields[fields.index("xpu_smi") + 1:]
            try:
                usage = float(values[2])
                total = float(values[3])
            except (IndexError, ValueError):
                # Truncated or non-numeric record, e.g. when the xpu_smi
                # command itself failed; skip it instead of aborting the
                # whole analysis.
                continue
            max_usage = max(max_usage, usage)
            max_mem = total

    # Plain float literals; eval() on constant strings added nothing.
    return (round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 32e12,
            128e12)
b/inference/docker_images/kunlunxin/kunlunxin_monitor.py @@ -1,256 +1,257 @@ -# !/usr/bin/env python3 -# encoding: utf-8 -''' -Usage: python3 sys-monitor.py -o operation -l [log_path] - -o, --operation start|stop|restart|status - -l, --log log path , ./logs/ default -''' - -import os -import sys -import time -import signal -import atexit -import argparse -import datetime -from multiprocessing import Process -import subprocess -import schedule - - -class Daemon: - ''' - daemon subprocess class. - usage: subclass this daemon and override the run() method. - sys-monitor.pid: in the /tmp/, auto del when unexpected exit. - verbose: debug mode, disabled default. - ''' - - def __init__(self, - pid_file, - log_file, - err_file, - gpu_log, - log_path, - rate=5, - stdin=os.devnull, - stdout=os.devnull, - stderr=os.devnull, - home_dir='.', - umask=0o22, - verbose=0): - self.stdin = stdin - self.stdout = stdout - self.stderr = stderr - self.home_dir = home_dir - self.verbose = verbose - self.pidfile = pid_file - self.logfile = log_file - self.errfile = err_file - self.gpufile = gpu_log - self.logpath = log_path - self.rate = rate - self.umask = umask - self.verbose = verbose - self.daemon_alive = True - - def get_pid(self): - try: - with open(self.pidfile, 'r') as pf: - pid = int(pf.read().strip()) - except IOError: - pid = None - except SystemExit: - pid = None - return pid - - def del_pid(self): - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - - def run(self): - ''' - NOTE: override the method in subclass - ''' - - def gpu_mon(file): - TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') - cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'" ## temp power mem w_mem use_rate - process = subprocess.Popen(cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - encoding='utf-8') - try: - out = process.communicate(timeout=10) - except subprocess.TimeoutExpired: - process.kill() - out = process.communicate() - - if 
process.returncode != 0: - result = "error" - result = TIMESTAMP + "\n xpu_smi " + out[0] + "\n" - with open(file, 'a') as f: - f.write(result) - - def timer_gpu_mon(): - gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) - gpu_process.start() - - schedule.every(self.rate).seconds.do(timer_gpu_mon) - while True: - schedule.run_pending() - time.sleep(5) - - def daemonize(self): - if self.verbose >= 1: - print('daemon process starting ...') - try: - pid = os.fork() - if pid > 0: - sys.exit(0) - except OSError as e: - sys.stderr.write('fork #1 failed: %d (%s)\n' % - (e.errno, e.strerror)) - sys.exit(1) - os.chdir(self.home_dir) - os.setsid() - os.umask(self.umask) - try: - pid = os.fork() - if pid > 0: - sys.exit(0) - except OSError as e: - sys.stderr.write('fork #2 failed: %d (%s)\n' % - (e.errno, e.strerror)) - sys.exit(1) - sys.stdout.flush() - sys.stderr.flush() - si = open(self.stdin, 'r') - so = open(self.stdout, 'a+') - if self.stderr: - se = open(self.stderr, 'a+') - else: - se = so - os.dup2(si.fileno(), sys.stdin.fileno()) - os.dup2(so.fileno(), sys.stdout.fileno()) - os.dup2(se.fileno(), sys.stderr.fileno()) - atexit.register(self.del_pid) - pid = str(os.getpid()) - with open(self.pidfile, 'w+') as f: - f.write('%s\n' % pid) - - def start(self): - if not os.path.exists(self.logpath): - os.makedirs(self.logpath) - elif os.path.exists(self.gpufile): - os.remove(self.gpufile) - if self.verbose >= 1: - print('ready to start ......') - # check for a pid file to see if the daemon already runs - pid = self.get_pid() - if pid: - msg = 'pid file %s already exists, is it already running?\n' - sys.stderr.write(msg % self.pidfile) - sys.exit(1) - # start the daemon - self.daemonize() - self.run() - - def stop(self): - if self.verbose >= 1: - print('stopping ...') - pid = self.get_pid() - if not pid: - msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile - sys.stderr.write(msg) - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - return - # try to kill the daemon process - try: - i = 0 - while 1: - os.kill(pid, signal.SIGTERM) - time.sleep(1) - i = i + 1 - if i % 10 == 0: - os.kill(pid, signal.SIGHUP) - except OSError as err: - err = str(err) - if err.find('No such process') > 0: - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - else: - print(str(err)) - sys.exit(1) - if self.verbose >= 1: - print('Stopped!') - - def restart(self): - self.stop() - self.start() - - def status(self): - pid = self.get_pid() - if pid: - if os.path.exists('/proc/%d' % pid): - return pid - return False - - -def parse_args(): - ''' Check script input parameter. ''' - parse = argparse.ArgumentParser(description='Sys monitor script') - parse.add_argument('-o', - type=str, - metavar='[operation]', - required=True, - help='start|stop|restart|status') - parse.add_argument('-l', - type=str, - metavar='[log_path]', - required=False, - default='./logs/', - help='log path') - args = parse.parse_args() - return args - - -def main(): - sample_rate1 = 5 - args = parse_args() - operation = args.o - log_path = args.l - pid_fn = str('/tmp/xpu_monitor.pid') - log_fn = str(log_path + '/kunlunxin_monitor.log') - err_fn = str(log_path + '/kunlunxin_monitor.err') - # result for gpu - gpu_fn = str(log_path + '/kunlunxin_monitor.log') - - subdaemon = Daemon(pid_fn, - log_fn, - err_fn, - gpu_fn, - log_path, - verbose=1, - rate=sample_rate1) - if operation == 'start': - subdaemon.start() - elif operation == 'stop': - subdaemon.stop() - elif operation == 'restart': - subdaemon.restart() - elif operation == 'status': - pid = subdaemon.status() - if pid: - print('process [%s] is running ......' 
class Daemon:
    '''
    Double-fork daemon that periodically samples `xpu_smi` into a log file.

    pid_file:  file holding the daemon pid; removed again at interpreter exit
               via atexit once the daemon is running.
    gpu_log:   file the xpu_smi samples are appended to.
    log_path:  directory created on start() if it does not exist.
    rate:      sampling period in seconds handed to `schedule`.
    verbose:   values >= 1 print progress messages; disabled by default.

    log_file and err_file are stored but not used by this class.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.verbose = verbose
        self.pidfile = pid_file
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log
        self.logpath = log_path
        self.rate = rate
        self.umask = umask
        self.daemon_alive = True

    def get_pid(self):
        '''Return the pid stored in the pid file, or None if unavailable.'''
        try:
            with open(self.pidfile, 'r') as pf:
                pid = int(pf.read().strip())
        except (IOError, ValueError, SystemExit):
            # Missing, unreadable, empty or corrupt pid file: report
            # "no daemon" instead of crashing the control command.
            pid = None
        return pid

    def del_pid(self):
        '''Remove the pid file if it exists.'''
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        Sampling loop; never returns.

        NOTE: override the method in subclass to change what is sampled.
        '''

        def gpu_mon(log_file):
            # Append one timestamped xpu_smi sample to log_file.
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'"  ## temp power mem w_mem use_rate
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            # The record is written whatever the exit status; stderr is
            # merged into stdout above, so failure text lands in the log.
            result = timestamp + "\n xpu_smi " + out[0] + "\n"
            with open(log_file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            # Sample in a child process so a slow xpu_smi call cannot block
            # the scheduling loop.
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        while True:
            schedule.run_pending()
            # Pending jobs are only checked every 5 seconds, so a rate below
            # 5 is effectively rounded up by this sleep.
            time.sleep(5)

    def daemonize(self):
        '''Detach from the terminal (double fork) and write the pid file.'''
        if self.verbose >= 1:
            print('daemon process starting ...')
        try:
            pid = os.fork()
            if pid > 0:
                # First parent exits.
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)
        try:
            pid = os.fork()
            if pid > 0:
                # Second parent exits; the grandchild is the daemon.
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        # Redirect the standard streams to the configured files.
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        '''Prepare the log location, daemonize and enter the sampling loop.'''
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            # Start every run with a fresh sample log.
            os.remove(self.gpufile)
        if self.verbose >= 1:
            print('ready to start ......')
        # check for a pid file to see if the daemon already runs
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        # start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        '''Terminate the daemon recorded in the pid file, if there is one.'''
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
            return
        # try to kill the daemon process
        try:
            i = 0
            while 1:
                # Keep signalling until os.kill() reports the process gone.
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                i = i + 1
                if i % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except ProcessLookupError:
            # The process has exited. Matching on the exception type is
            # independent of the locale of the OS error message.
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
        except OSError as err:
            print(str(err))
            sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        '''Stop the running daemon (if any) and start a new one.'''
        self.stop()
        self.start()

    def status(self):
        '''Return the daemon pid if /proc/<pid> exists, otherwise False.'''
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False
def parse_args(argv=None):
    '''
    Parse the monitor's command line.

    Args:
        argv: optional list of argument strings; when None, argparse reads
            sys.argv[1:].

    Returns:
        argparse.Namespace with
            o: requested operation (start|stop|restart|status), required
            l: log directory, './logs/' by default
    '''
    parse = argparse.ArgumentParser(description='Sys monitor script')
    # The long spellings match the usage text in the module docstring; dest
    # keeps the attribute names (args.o / args.l) that main() reads.
    parse.add_argument('-o',
                       '--operation',
                       dest='o',
                       type=str,
                       metavar='[operation]',
                       required=True,
                       help='start|stop|restart|status')
    parse.add_argument('-l',
                       '--log',
                       dest='l',
                       type=str,
                       metavar='[log_path]',
                       required=False,
                       default='./logs/',
                       help='log path')
    args = parse.parse_args(argv)
    return args
def main():
    '''
    Command-line entry point: run the requested operation on the monitor.

    Reads the operation and log directory from parse_args(), builds a Daemon
    sampling every 5 seconds, and dispatches start/stop/restart/status.  Any
    other operation prints "invalid argument!" and exits with status 1.
    '''
    sample_rate1 = 5
    args = parse_args()
    operation = args.o
    log_path = args.l
    pid_fn = '/tmp/xpu_monitor.pid'
    # os.path.join avoids the doubled separator produced by string
    # concatenation with the default './logs/' directory.
    log_fn = os.path.join(log_path, 'kunlunxin_monitor.log')
    err_fn = os.path.join(log_path, 'kunlunxin_monitor.err')
    # result for gpu (same file as log_fn)
    gpu_fn = os.path.join(log_path, 'kunlunxin_monitor.log')

    subdaemon = Daemon(pid_fn,
                       log_fn,
                       err_fn,
                       gpu_fn,
                       log_path,
                       verbose=1,
                       rate=sample_rate1)
    if operation == 'start':
        subdaemon.start()
    elif operation == 'stop':
        subdaemon.stop()
    elif operation == 'restart':
        subdaemon.restart()
    elif operation == 'status':
        pid = subdaemon.status()
        if pid:
            print('process [%s] is running ......' % pid)
        else:
            # status() returns False here, so the message shows "[False]".
            print('daemon process [%s] stopped' % pid)
    else:
        print("invalid argument!")
        sys.exit(1)


if __name__ == '__main__':
    main()
= "cd " + dp_path + " && " + sys.executable \ + stop_mon_cmd = "cd " + dp_path + " && sudo " + sys.executable \ + " " + ven_mon_path + " -o stop" logger.debug("Run cmd in the cluster to stop vendor's monitors: " + stop_mon_cmd)