From 1545a07916063483d3c6b8da49e3e9ba6171326a Mon Sep 17 00:00:00 2001 From: Minhae Ye Date: Mon, 28 Aug 2023 11:58:28 +0900 Subject: [PATCH 1/6] Fix torchvision warning ``` The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights. ``` --- experiments/amp_nchw_autotvm/run.py | 2 +- experiments/amp_nhwc_autoscheduler/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/amp_nchw_autotvm/run.py b/experiments/amp_nchw_autotvm/run.py index 1c69b94dad53..e055c5a202f7 100644 --- a/experiments/amp_nchw_autotvm/run.py +++ b/experiments/amp_nchw_autotvm/run.py @@ -71,7 +71,7 @@ def main(_): torch_model = ( timm.create_model("tf_efficientnetv2_s", pretrained=True) if FLAGS.model == "efficientnet_v2_s" - else getattr(torchvision.models, FLAGS.model)(pretrained=True) + else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") ) torch_model.eval() scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() diff --git a/experiments/amp_nhwc_autoscheduler/run.py b/experiments/amp_nhwc_autoscheduler/run.py index c42c1b2001b4..00d51360e5ee 100644 --- a/experiments/amp_nhwc_autoscheduler/run.py +++ b/experiments/amp_nhwc_autoscheduler/run.py @@ -70,7 +70,7 @@ def main(_): torch_model = ( timm.create_model("tf_efficientnetv2_s", pretrained=True) if FLAGS.model == "efficientnet_v2_s" - else getattr(torchvision.models, FLAGS.model)(pretrained=True) + else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") ) torch_model.eval() scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() From 702935cb409138a5d903e0df82c500af4c8c98db Mon Sep 17 00:00:00 2001 From: Minhae Ye Date: Mon, 28 Aug 2023 12:01:34 +0900 Subject: [PATCH 2/6] Fix tvm warning ``` Desired layout(s) not specified for op: nn.max_pool2d ``` --- experiments/amp_nchw_autotvm/run.py | 2 +- experiments/amp_nhwc_autoscheduler/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/amp_nchw_autotvm/run.py b/experiments/amp_nchw_autotvm/run.py index e055c5a202f7..ad33cd9b0d10 100644 --- a/experiments/amp_nchw_autotvm/run.py +++ b/experiments/amp_nchw_autotvm/run.py @@ -31,7 +31,7 @@ def tvm_amp(mod, params, to_nhwc=False): mod = tvm.relay.transform.ToMixedPrecision()(mod) if to_nhwc: - desired_layouts = {"nn.conv2d": ["NHWC", "default"], "qnn.conv2d": ["NHWC", "default"]} + desired_layouts = {k: ["NHWC", "default"] for k in ["nn.conv2d", "nn.max_pool2d", "qnn.conv2d"]} mod = relay.transform.ConvertLayout(desired_layouts)(mod) mod = tvm.relay.transform.EliminateCommonSubexpr()(mod) diff --git a/experiments/amp_nhwc_autoscheduler/run.py b/experiments/amp_nhwc_autoscheduler/run.py index 00d51360e5ee..26b8afd7c0b2 100644 --- a/experiments/amp_nhwc_autoscheduler/run.py +++ b/experiments/amp_nhwc_autoscheduler/run.py @@ -30,7 +30,7 @@ def tvm_amp(mod, params, to_nhwc=False): mod = tvm.relay.transform.ToMixedPrecision()(mod) if to_nhwc: - desired_layouts = {"nn.conv2d": ["NHWC", "default"], "qnn.conv2d": ["NHWC", "default"]} + desired_layouts = {k: ["NHWC", "default"] for k in ["nn.conv2d", "nn.max_pool2d", "qnn.conv2d"]} mod = relay.transform.ConvertLayout(desired_layouts)(mod) mod = tvm.relay.transform.EliminateCommonSubexpr()(mod) From 587e9970cb17cbf78150e8a6551ce4af944c9cd5 Mon Sep 17 00:00:00 2001 From: Minhae Ye Date: Mon, 28 Aug 2023 18:02:42 +0900 Subject: [PATCH 3/6] Refactor scripts --- experiments/amp_nchw_autotvm/run.py | 362 +++++++++++----------- experiments/amp_nhwc_autoscheduler/run.py | 279 ++++++++--------- 2 files changed, 318 insertions(+), 323 deletions(-) diff --git a/experiments/amp_nchw_autotvm/run.py b/experiments/amp_nchw_autotvm/run.py index ad33cd9b0d10..ad21744ac468 100644 --- a/experiments/amp_nchw_autotvm/run.py +++ b/experiments/amp_nchw_autotvm/run.py @@ -1,218 +1,212 @@ +import os from copy import deepcopy -from PIL import Image import numpy as np import timm import torch import torchvision +from absl import app, flags +from PIL import Image from torchvision import transforms + import tvm -from tvm import relay, auto_scheduler, autotvm -from tvm.contrib.download import download_testdata +from tvm import autotvm, relay +from tvm.autotvm.tuner import GATuner, GridSearchTuner, RandomTuner, XGBTuner from tvm.contrib import graph_executor -from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib.download import download_testdata + +FLAGS = flags.FLAGS + +flags.DEFINE_enum( + "model", "mobilenet_v2", ["mobilenet_v2", "resnet50", "efficientnet_v2_s"], "Choose model." +) def tvm_amp(mod, params, to_nhwc=False): - mod = tvm.relay.transform.EliminateCommonSubexpr()(mod) + mod = relay.transform.EliminateCommonSubexpr()(mod) - BindPass = tvm.relay.transform.function_pass( - lambda fn, new_mod, ctx: tvm.relay.build_module.bind_params_by_name(fn, params), + BindPass = relay.transform.function_pass( + lambda fn, new_mod, ctx: relay.build_module.bind_params_by_name(fn, params), opt_level=1, ) mod = BindPass(mod) - mod = tvm.relay.transform.FoldConstant()(mod) + mod = relay.transform.FoldConstant()(mod) - mod = tvm.relay.transform.CombineParallelBatchMatmul()(mod) - mod = tvm.relay.transform.FoldConstant()(mod) + mod = relay.transform.CombineParallelBatchMatmul()(mod) + mod = relay.transform.FoldConstant()(mod) - mod = tvm.relay.transform.InferType()(mod) - mod = tvm.relay.transform.ToMixedPrecision()(mod) + mod = relay.transform.InferType()(mod) + mod = relay.transform.ToMixedPrecision()(mod) if to_nhwc: - desired_layouts = {k: ["NHWC", "default"] for k in ["nn.conv2d", "nn.max_pool2d", "qnn.conv2d"]} + desired_layouts = { + k: ["NHWC", "default"] for k in ["nn.conv2d", "nn.max_pool2d", "qnn.conv2d"] + } mod = relay.transform.ConvertLayout(desired_layouts)(mod) - mod = tvm.relay.transform.EliminateCommonSubexpr()(mod) - mod = tvm.relay.transform.FoldConstant()(mod) + mod = relay.transform.EliminateCommonSubexpr()(mod) + mod = relay.transform.FoldConstant()(mod) return mod -if __name__ == "__main__": - import os - - from absl import app, flags - +def main(_): os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin/" - FLAGS = flags.FLAGS + # Prepare test data + img_url = "https://github.com/pytorch/hub/raw/master/images/dog.jpg" + img_path = download_testdata(img_url, "dog.jpg", module="data") + img = Image.open(img_path) + preprocess_input = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + data = preprocess_input(img) + data = np.expand_dims(data, 0) + + synset_url = ( + "https://gist.githubusercontent.com/zhreshold/" + "4d0b62f3d01426887599d4f7ede23ee5/raw/" + "596b27d23537e5a1b5751d2b0481ef172f58b539/" + "imagenet1000_clsid_to_human.txt" + ) + synset_name = "imagenet1000_clsid_to_human.txt" + synset_path = download_testdata(synset_url, synset_name, module="data") + with open(synset_path) as f: + synset = eval(f.read()) + + # Prepare model + torch_model = ( + timm.create_model("tf_efficientnetv2_s", pretrained=True) + if FLAGS.model == "efficientnet_v2_s" + else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") + ) + torch_model.eval() + scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() + input_name = "input_1" + shape_list = [(input_name, data.shape)] + mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) + mod = tvm_amp(mod, params, to_nhwc=False) + params = None + + # Extract tasks from the network + target = tvm.target.Target( + "cuda -arch=sm_72", host="llvm -mtriple=aarch64-linux-gnu -mcpu=carmel" + ) + device_key = "xavier" + host = "0.0.0.0" + port = 9190 + log_file = f"{FLAGS.model}.json" + tuning_option = { + "log_filename": log_file, + "tuner": "xgb", + "n_trial": 4000, + "early_stopping": 600, + "measure_option": autotvm.measure_option( + builder=autotvm.LocalBuilder(timeout=60), + runner=autotvm.RPCRunner( + device_key, host=host, port=port, repeat=1, min_repeat_ms=300, timeout=600 + ), + ), + } + + print("Extract tasks...") + tasks = autotvm.task.extract_from_program( + mod["main"], + target=target, + params=params, + ) - flags.DEFINE_enum( - "model", "mobilenet_v2", ["mobilenet_v2", "resnet50", "efficientnet_v2_s"], "Choose model." + def tune_tasks( + tasks, + measure_option, + tuner="xgb", + n_trial=1000, + early_stopping=None, + log_filename="tuning.log", + use_transfer_learning=True, + ): + print("Begin tuning...") + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + + # create tuner + if tuner == "xgb" or tuner == "xgb-rank": + tuner_obj = XGBTuner(tsk, loss_type="rank") + elif tuner == "ga": + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == "random": + tuner_obj = RandomTuner(tsk) + elif tuner == "gridsearch": + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(log_file): + tuner_obj.load_history(autotvm.record.load_from_file(log_file)) + + # do tuning + tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune( + n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + autotvm.callback.log_to_file(log_file), + ], + ) + + tune_tasks(tasks, **tuning_option) + + # Compile with the history best + print("Compile...") + with autotvm.apply_history_best(log_file): + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target) + lib_file = f"{FLAGS.model}.tar" + lib.export_library(lib_file) + + remote = autotvm.measure.request_remote( + device_key=device_key, host=host, port=port, timeout=180 ) + dev = remote.device(str(target)) - def main(_): - img_url = "https://github.com/pytorch/hub/raw/master/images/dog.jpg" - img_path = download_testdata(img_url, "dog.jpg", module="data") - img = Image.open(img_path) - preprocess_input = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - data = preprocess_input(img) - data = np.expand_dims(data, 0) - - torch_model = ( - timm.create_model("tf_efficientnetv2_s", pretrained=True) - if FLAGS.model == "efficientnet_v2_s" - else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") - ) - torch_model.eval() - scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() - shape_list = [("input_1", data.shape)] - mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) - mod = tvm_amp(mod, params, to_nhwc=False) - params = None - - target = tvm.target.Target( - "cuda -arch=sm_72", host="llvm -mtriple=aarch64-linux-gnu -mcpu=carmel" - ) - device_key = "xavier" - host = "0.0.0.0" - port = 9190 - - log_file = f"{FLAGS.model}.json" - lib_file = f"{FLAGS.model}.tar" - - tuning_option = { - "log_filename": log_file, - "tuner": "xgb", - "n_trial": 4000, - "early_stopping": 600, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(timeout=60), - runner=autotvm.RPCRunner( - device_key, - host=host, - port=port, - repeat=1, - min_repeat_ms=300, - timeout=600 - ), - ), - } + remote.upload(lib_file) + lib = remote.load_module(lib_file) + + # Create graph executor + module = graph_executor.GraphModule(lib["default"](dev)) + dtype = "float32" + data_tvm = tvm.nd.array(data.astype(dtype)) + module.set_input(input_name, data_tvm) - print("Extract tasks...") - tasks = autotvm.task.extract_from_program( - mod["main"], - target=target, - params=params, - ) - - def tune_tasks( - tasks, - measure_option, - tuner="xgb", - n_trial=1000, - early_stopping=None, - log_filename="tuning.log", - use_transfer_learning=True, - ): - for i, tsk in enumerate(reversed(tasks)): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # create tuner - if tuner == "xgb" or tuner == "xgb-rank": - tuner_obj = XGBTuner(tsk, loss_type="rank") - elif tuner == "ga": - tuner_obj = GATuner(tsk, pop_size=50) - elif tuner == "random": - tuner_obj = RandomTuner(tsk) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(tsk) - else: - raise ValueError("Invalid tuner: " + tuner) - - if use_transfer_learning: - if os.path.isfile(log_file): - tuner_obj.load_history(autotvm.record.load_from_file(log_file)) - - # do tuning - tsk_trial = min(n_trial, len(tsk.config_space)) - tuner_obj.tune( - n_trial=tsk_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(tsk_trial, prefix=prefix), - autotvm.callback.log_to_file(log_file), - ], - ) - - print("Tuning...") - tune_tasks(tasks, **tuning_option) - - # Compile with the history best - print("Compile...") - with autotvm.apply_history_best(log_file): - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target) - lib.export_library(lib_file) - - remote = tvm.auto_scheduler.utils.request_remote( - device_key=device_key, host=host, port=port, timeout=180 - ) - dev = remote.device(str(target)) - - remote.upload(lib_file) - lib = remote.load_module(lib_file) - - # Create graph executor - module = graph_executor.GraphModule(lib["default"](dev)) - dtype = "float32" - data_tvm = tvm.nd.array(data.astype(dtype)) - module.set_input("input_1", data_tvm) - - # Evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", dev, repeat=50) - prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond - print( - "Mean inference time (std dev): %.2f ms (%.2f ms)" - % (np.mean(prof_res), np.std(prof_res)) - ) - - module.run() - tvm_out = module.get_output(0) - top1_tvm = np.argmax(tvm_out.asnumpy()) - - synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] - ) - synset_name = "imagenet1000_clsid_to_human.txt" - synset_path = download_testdata(synset_url, synset_name, module="data") - with open(synset_path) as f: - synset = eval(f.read()) - print("Relay top-1 id: {}, class name: {}".format(top1_tvm, synset[top1_tvm])) - # confirm correctness with torch output - with torch.no_grad(): - torch_img = torch.from_numpy(data) - output = torch_model(torch_img) - - # Get top-1 result for PyTorch - top1_torch = np.argmax(output.numpy()) - - print("Torch top-1 id: {}, class name: {}".format(top1_torch, synset[top1_torch])) + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", dev, repeat=50) + prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print(f"Mean inference time (std dev): {np.mean(prof_res):.2f} ms ({np.std(prof_res):.2f} ms)") + module.run() + tvm_out = module.get_output(0) + top1_tvm = np.argmax(tvm_out.asnumpy()) + + print(f"Relay top-1 id: {top1_tvm}, class name: {synset[top1_tvm]}") + # confirm correctness with torch output + with torch.no_grad(): + torch_img = torch.from_numpy(data) + output = torch_model(torch_img) + + # Get top-1 result for PyTorch + top1_torch = np.argmax(output.numpy()) + + print(f"Torch top-1 id: {top1_torch}, class name: {synset[top1_torch]}") + + +if __name__ == "__main__": app.run(main) diff --git a/experiments/amp_nhwc_autoscheduler/run.py b/experiments/amp_nhwc_autoscheduler/run.py index 26b8afd7c0b2..a93e1485136b 100644 --- a/experiments/amp_nhwc_autoscheduler/run.py +++ b/experiments/amp_nhwc_autoscheduler/run.py @@ -1,177 +1,178 @@ +import os from copy import deepcopy -from PIL import Image import numpy as np import timm import torch import torchvision +from absl import app, flags +from PIL import Image from torchvision import transforms + import tvm -from tvm import relay, auto_scheduler, autotvm -from tvm.contrib.download import download_testdata +from tvm import auto_scheduler, relay from tvm.contrib import graph_executor -from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt +from tvm.contrib.download import download_testdata + +FLAGS = flags.FLAGS + +flags.DEFINE_enum( + "model", "mobilenet_v2", ["mobilenet_v2", "resnet50", "efficientnet_v2_s"], "Choose model." +) def tvm_amp(mod, params, to_nhwc=False): - mod = tvm.relay.transform.EliminateCommonSubexpr()(mod) + mod = relay.transform.EliminateCommonSubexpr()(mod) - BindPass = tvm.relay.transform.function_pass( - lambda fn, new_mod, ctx: tvm.relay.build_module.bind_params_by_name(fn, params), + BindPass = relay.transform.function_pass( + lambda fn, new_mod, ctx: relay.build_module.bind_params_by_name(fn, params), opt_level=1, ) mod = BindPass(mod) - mod = tvm.relay.transform.FoldConstant()(mod) + mod = relay.transform.FoldConstant()(mod) - mod = tvm.relay.transform.CombineParallelBatchMatmul()(mod) - mod = tvm.relay.transform.FoldConstant()(mod) + mod = relay.transform.CombineParallelBatchMatmul()(mod) + mod = relay.transform.FoldConstant()(mod) - mod = tvm.relay.transform.InferType()(mod) - mod = tvm.relay.transform.ToMixedPrecision()(mod) + mod = relay.transform.InferType()(mod) + mod = relay.transform.ToMixedPrecision()(mod) if to_nhwc: - desired_layouts = {k: ["NHWC", "default"] for k in ["nn.conv2d", "nn.max_pool2d", "qnn.conv2d"]} + desired_layouts = { + k: ["NHWC", "default"] for k in ["nn.conv2d", "nn.max_pool2d", "qnn.conv2d"] + } mod = relay.transform.ConvertLayout(desired_layouts)(mod) - mod = tvm.relay.transform.EliminateCommonSubexpr()(mod) - mod = tvm.relay.transform.FoldConstant()(mod) + mod = relay.transform.EliminateCommonSubexpr()(mod) + mod = relay.transform.FoldConstant()(mod) return mod -if __name__ == "__main__": - import os - - from absl import app, flags - +def main(_): os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin/" - FLAGS = flags.FLAGS - - flags.DEFINE_enum( - "model", "mobilenet_v2", ["mobilenet_v2", "resnet50", "efficientnet_v2_s"], "Choose model." + # Prepare test data + img_url = "https://github.com/pytorch/hub/raw/master/images/dog.jpg" + img_path = download_testdata(img_url, "dog.jpg", module="data") + img = Image.open(img_path) + preprocess_input = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] ) - - def main(_): - img_url = "https://github.com/pytorch/hub/raw/master/images/dog.jpg" - img_path = download_testdata(img_url, "dog.jpg", module="data") - img = Image.open(img_path) - preprocess_input = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] + data = preprocess_input(img) + data = np.expand_dims(data, 0) + + synset_url = ( + "https://gist.githubusercontent.com/zhreshold/" + "4d0b62f3d01426887599d4f7ede23ee5/raw/" + "596b27d23537e5a1b5751d2b0481ef172f58b539/" + "imagenet1000_clsid_to_human.txt" + ) + synset_name = "imagenet1000_clsid_to_human.txt" + synset_path = download_testdata(synset_url, synset_name, module="data") + with open(synset_path) as f: + synset = eval(f.read()) + + # Prepare model + torch_model = ( + timm.create_model("tf_efficientnetv2_s", pretrained=True) + if FLAGS.model == "efficientnet_v2_s" + else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") + ) + torch_model.eval() + scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() + input_name = "input_1" + shape_list = [(input_name, data.shape)] + mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) + mod = tvm_amp(mod, params, to_nhwc=True) + params = None + + # Extract tasks from the network + target = tvm.target.Target( + "cuda -arch=sm_72", host="llvm -mtriple=aarch64-linux-gnu -mcpu=carmel" + ) + device_key = "xavier" + host = "0.0.0.0" + port = 9190 + log_file = f"{FLAGS.model}.json" + + print("Extract tasks...") + tasks, task_weights = auto_scheduler.extract_tasks(deepcopy(mod), params, target) + + for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + + def run_tuning(): + print("Begin tuning...") + remote_runner = auto_scheduler.RPCRunner( + key=device_key, host=host, port=port, repeat=1, min_repeat_ms=300, timeout=600 ) - data = preprocess_input(img) - data = np.expand_dims(data, 0) - torch_model = ( - timm.create_model("tf_efficientnetv2_s", pretrained=True) - if FLAGS.model == "efficientnet_v2_s" - else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") + tuner = auto_scheduler.TaskScheduler( + tasks, task_weights, strategy="round-robin", load_log_file=log_file ) - torch_model.eval() - scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() - shape_list = [("input_1", data.shape)] - mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) - mod = tvm_amp(mod, params, to_nhwc=True) - params = None - - target = tvm.target.Target( - "cuda -arch=sm_72", host="llvm -mtriple=aarch64-linux-gnu -mcpu=carmel" - ) - device_key = "xavier" - host = "0.0.0.0" - port = 9190 - - log_file = f"{FLAGS.model}.json" - lib_file = f"{FLAGS.model}.tar" - - # Extract tasks from the network - print("Extract tasks...") - tasks, task_weights = auto_scheduler.extract_tasks(deepcopy(mod), params, target) - - for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) - - def run_tuning(): - print("Begin tuning...") - remote_runner = auto_scheduler.RPCRunner( - key=device_key, host=host, port=port, repeat=1, min_repeat_ms=300, timeout=600 - ) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights, strategy='round-robin', load_log_file=log_file) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=len(tasks) * 2000, - builder=auto_scheduler.LocalBuilder(timeout=60), - runner=remote_runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option, per_task_early_stopping=600) - - run_tuning() - - # Compile with the history best - print("Compile...") - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, config={"relay.backend.use_auto_scheduler": True} - ): - lib = relay.build(mod, target=target) - lib.export_library(lib_file) - - remote = tvm.auto_scheduler.utils.request_remote( - device_key=device_key, host=host, port=port, timeout=180 - ) - dev = remote.device(str(target)) - - remote.upload(lib_file) - lib = remote.load_module(lib_file) - - # Create graph executor - module = graph_executor.GraphModule(lib["default"](dev)) - dtype = "float32" - data_tvm = tvm.nd.array(data.astype(dtype)) - module.set_input("input_1", data_tvm) - - # Evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", dev, repeat=50) - prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond - print( - "Mean inference time (std dev): %.2f ms (%.2f ms)" - % (np.mean(prof_res), np.std(prof_res)) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=len(tasks) * 2000, + builder=auto_scheduler.LocalBuilder(timeout=60), + runner=remote_runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) - module.run() - tvm_out = module.get_output(0) - top1_tvm = np.argmax(tvm_out.asnumpy()) - - synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] - ) - synset_name = "imagenet1000_clsid_to_human.txt" - synset_path = download_testdata(synset_url, synset_name, module="data") - with open(synset_path) as f: - synset = eval(f.read()) - print("Relay top-1 id: {}, class name: {}".format(top1_tvm, synset[top1_tvm])) - # confirm correctness with torch output - with torch.no_grad(): - torch_img = torch.from_numpy(data) - output = torch_model(torch_img) + tuner.tune(tune_option, per_task_early_stopping=600) + + run_tuning() + + # Compile with the history best + print("Compile...") + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target) + lib_file = f"{FLAGS.model}.tar" + lib.export_library(lib_file) + + remote = tvm.auto_scheduler.utils.request_remote( + device_key=device_key, host=host, port=port, timeout=180 + ) + dev = remote.device(str(target)) - # Get top-1 result for PyTorch - top1_torch = np.argmax(output.numpy()) + remote.upload(lib_file) + lib = remote.load_module(lib_file) - print("Torch top-1 id: {}, class name: {}".format(top1_torch, synset[top1_torch])) + # Create graph executor + module = graph_executor.GraphModule(lib["default"](dev)) + dtype = "float32" + data_tvm = tvm.nd.array(data.astype(dtype)) + module.set_input(input_name, data_tvm) + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", dev, repeat=50) + prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print(f"Mean inference time (std dev): {np.mean(prof_res):.2f} ms ({np.std(prof_res):.2f} ms)") + + module.run() + tvm_out = module.get_output(0) + top1_tvm = np.argmax(tvm_out.asnumpy()) + + print(f"Relay top-1 id: {top1_tvm}, class name: {synset[top1_tvm]}") + # confirm correctness with torch output + with torch.no_grad(): + torch_img = torch.from_numpy(data) + output = torch_model(torch_img) + + # Get top-1 result for PyTorch + top1_torch = np.argmax(output.numpy()) + + print(f"Torch top-1 id: {top1_torch}, class name: {synset[top1_torch]}") + + +if __name__ == "__main__": app.run(main) From f55dfbdb2bd9ef47a4b5a2403d34719b8fa6a063 Mon Sep 17 00:00:00 2001 From: Minhae Ye Date: Fri, 1 Sep 2023 21:41:35 +0900 Subject: [PATCH 4/6] Add local run script --- .../amp_nchw_autotvm/{run.py => tune.py} | 0 experiments/amp_nhwc_autoscheduler/run.py | 90 +++------ experiments/amp_nhwc_autoscheduler/tune.py | 178 ++++++++++++++++++ 3 files changed, 209 insertions(+), 59 deletions(-) rename experiments/amp_nchw_autotvm/{run.py => tune.py} (100%) create mode 100644 experiments/amp_nhwc_autoscheduler/tune.py diff --git a/experiments/amp_nchw_autotvm/run.py b/experiments/amp_nchw_autotvm/tune.py similarity index 100% rename from experiments/amp_nchw_autotvm/run.py rename to experiments/amp_nchw_autotvm/tune.py diff --git a/experiments/amp_nhwc_autoscheduler/run.py b/experiments/amp_nhwc_autoscheduler/run.py index a93e1485136b..7b894083228d 100644 --- a/experiments/amp_nhwc_autoscheduler/run.py +++ b/experiments/amp_nhwc_autoscheduler/run.py @@ -1,6 +1,7 @@ import os from copy import deepcopy +import cv2 import numpy as np import timm import torch @@ -85,68 +86,34 @@ def main(_): else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") ) torch_model.eval() - scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() + + dummy_input = torch.randn(data.shape) + scripted_torch_model = torch.jit.trace(torch_model, dummy_input).eval() input_name = "input_1" shape_list = [(input_name, data.shape)] mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) mod = tvm_amp(mod, params, to_nhwc=True) params = None - # Extract tasks from the network - target = tvm.target.Target( - "cuda -arch=sm_72", host="llvm -mtriple=aarch64-linux-gnu -mcpu=carmel" - ) - device_key = "xavier" - host = "0.0.0.0" - port = 9190 - log_file = f"{FLAGS.model}.json" - - print("Extract tasks...") - tasks, task_weights = auto_scheduler.extract_tasks(deepcopy(mod), params, target) - - for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) - - def run_tuning(): - print("Begin tuning...") - remote_runner = auto_scheduler.RPCRunner( - key=device_key, host=host, port=port, repeat=1, min_repeat_ms=300, timeout=600 - ) - - tuner = auto_scheduler.TaskScheduler( - tasks, task_weights, strategy="round-robin", load_log_file=log_file - ) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=len(tasks) * 2000, - builder=auto_scheduler.LocalBuilder(timeout=60), - runner=remote_runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option, per_task_early_stopping=600) - - run_tuning() + onnx_file = f"{FLAGS.model}.onnx" + torch.onnx.export(torch_model, dummy_input, onnx_file, input_names=[input_name]) + opencv_net = cv2.dnn.readNetFromONNX(onnx_file) + opencv_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV) + opencv_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU) + opencv_net.setInput(data) # Compile with the history best + log_file = f"{FLAGS.model}.json" + target = tvm.target.Target("cuda -arch=sm_72", host="llvm -mcpu=carmel") print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True} ): lib = relay.build(mod, target=target) - lib_file = f"{FLAGS.model}.tar" - lib.export_library(lib_file) - - remote = tvm.auto_scheduler.utils.request_remote( - device_key=device_key, host=host, port=port, timeout=180 - ) - dev = remote.device(str(target)) - - remote.upload(lib_file) - lib = remote.load_module(lib_file) # Create graph executor + dev = str(target) module = graph_executor.GraphModule(lib["default"](dev)) dtype = "float32" data_tvm = tvm.nd.array(data.astype(dtype)) @@ -154,24 +121,29 @@ def run_tuning(): # Evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", dev, repeat=50) - prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond - print(f"Mean inference time (std dev): {np.mean(prof_res):.2f} ms ({np.std(prof_res):.2f} ms)") - + repeat = 50 + ftimer = module.module.time_evaluator("run", dev, repeat=repeat) + tvm_prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print( + f"Relay Mean inference time (std dev): {np.mean(tvm_prof_res):.2f} ms ({np.std(tvm_prof_res):.2f} ms)" + ) module.run() tvm_out = module.get_output(0) top1_tvm = np.argmax(tvm_out.asnumpy()) - print(f"Relay top-1 id: {top1_tvm}, class name: {synset[top1_tvm]}") - # confirm correctness with torch output - with torch.no_grad(): - torch_img = torch.from_numpy(data) - output = torch_model(torch_img) - # Get top-1 result for PyTorch - top1_torch = np.argmax(output.numpy()) - - print(f"Torch top-1 id: {top1_torch}, class name: {synset[top1_torch]}") + cv2.setNumThreads(1) + opencv_prof_res = [] + opencv_out = opencv_net.forward() + for _ in range(repeat): + opencv_net.forward() + opencv_prof_res.append(opencv_net.getPerfProfile()[0]) + opencv_prof_res = np.array(opencv_prof_res) * 1000.0 / cv2.getTickFrequency() + print( + f"OpenCV Mean inference time (std dev): {np.mean(opencv_prof_res):.2f} ms ({np.std(opencv_prof_res):.2f} ms)" + ) + top1_opencv = np.argmax(opencv_out) + print(f"OpenCV top-1 id: {top1_opencv}, class name: {synset[top1_opencv]}") if __name__ == "__main__": diff --git a/experiments/amp_nhwc_autoscheduler/tune.py b/experiments/amp_nhwc_autoscheduler/tune.py new file mode 100644 index 000000000000..a93e1485136b --- /dev/null +++ b/experiments/amp_nhwc_autoscheduler/tune.py @@ -0,0 +1,178 @@ +import os +from copy import deepcopy + +import numpy as np +import timm +import torch +import torchvision +from absl import app, flags +from PIL import Image +from torchvision import transforms + +import tvm +from tvm import auto_scheduler, relay +from tvm.contrib import graph_executor +from tvm.contrib.download import download_testdata + +FLAGS = flags.FLAGS + +flags.DEFINE_enum( + "model", "mobilenet_v2", ["mobilenet_v2", "resnet50", "efficientnet_v2_s"], "Choose model." +) + + +def tvm_amp(mod, params, to_nhwc=False): + mod = relay.transform.EliminateCommonSubexpr()(mod) + + BindPass = relay.transform.function_pass( + lambda fn, new_mod, ctx: relay.build_module.bind_params_by_name(fn, params), + opt_level=1, + ) + mod = BindPass(mod) + mod = relay.transform.FoldConstant()(mod) + + mod = relay.transform.CombineParallelBatchMatmul()(mod) + mod = relay.transform.FoldConstant()(mod) + + mod = relay.transform.InferType()(mod) + mod = relay.transform.ToMixedPrecision()(mod) + + if to_nhwc: + desired_layouts = { + k: ["NHWC", "default"] for k in ["nn.conv2d", "nn.max_pool2d", "qnn.conv2d"] + } + mod = relay.transform.ConvertLayout(desired_layouts)(mod) + + mod = relay.transform.EliminateCommonSubexpr()(mod) + mod = relay.transform.FoldConstant()(mod) + + return mod + + +def main(_): + os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin/" + + # Prepare test data + img_url = "https://github.com/pytorch/hub/raw/master/images/dog.jpg" + img_path = download_testdata(img_url, "dog.jpg", module="data") + img = Image.open(img_path) + preprocess_input = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + data = preprocess_input(img) + data = np.expand_dims(data, 0) + + synset_url = ( + "https://gist.githubusercontent.com/zhreshold/" + "4d0b62f3d01426887599d4f7ede23ee5/raw/" + "596b27d23537e5a1b5751d2b0481ef172f58b539/" + "imagenet1000_clsid_to_human.txt" + ) + synset_name = "imagenet1000_clsid_to_human.txt" + synset_path = download_testdata(synset_url, synset_name, module="data") + with open(synset_path) as f: + synset = eval(f.read()) + + # Prepare model + torch_model = ( + timm.create_model("tf_efficientnetv2_s", pretrained=True) + if FLAGS.model == "efficientnet_v2_s" + else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") + ) + torch_model.eval() + scripted_torch_model = torch.jit.trace(torch_model, torch.randn(data.shape)).eval() + input_name = "input_1" + shape_list = [(input_name, data.shape)] + mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) + mod = tvm_amp(mod, params, to_nhwc=True) + params = None + + # Extract tasks from the network + target = tvm.target.Target( + "cuda -arch=sm_72", host="llvm -mtriple=aarch64-linux-gnu -mcpu=carmel" + ) + device_key = "xavier" + host = "0.0.0.0" + port = 9190 + log_file = f"{FLAGS.model}.json" + + print("Extract tasks...") + tasks, task_weights = auto_scheduler.extract_tasks(deepcopy(mod), params, target) + + for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + + def run_tuning(): + print("Begin tuning...") + remote_runner = auto_scheduler.RPCRunner( + key=device_key, host=host, port=port, repeat=1, min_repeat_ms=300, timeout=600 + ) + + tuner = auto_scheduler.TaskScheduler( + tasks, task_weights, strategy="round-robin", load_log_file=log_file + ) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=len(tasks) * 2000, + builder=auto_scheduler.LocalBuilder(timeout=60), + runner=remote_runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option, per_task_early_stopping=600) + + run_tuning() + + # Compile with the history best + print("Compile...") + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target) + lib_file = f"{FLAGS.model}.tar" + lib.export_library(lib_file) + + remote = tvm.auto_scheduler.utils.request_remote( + device_key=device_key, host=host, port=port, timeout=180 + ) + dev = remote.device(str(target)) + + remote.upload(lib_file) + lib = remote.load_module(lib_file) + + # Create graph executor + module = graph_executor.GraphModule(lib["default"](dev)) + dtype = "float32" + data_tvm = tvm.nd.array(data.astype(dtype)) + module.set_input(input_name, data_tvm) + + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", dev, repeat=50) + prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print(f"Mean inference time (std dev): {np.mean(prof_res):.2f} ms ({np.std(prof_res):.2f} ms)") + + module.run() + tvm_out = module.get_output(0) + top1_tvm = np.argmax(tvm_out.asnumpy()) + + print(f"Relay top-1 id: {top1_tvm}, class name: {synset[top1_tvm]}") + # confirm correctness with torch output + with torch.no_grad(): + torch_img = torch.from_numpy(data) + output = torch_model(torch_img) + + # Get top-1 result for PyTorch + top1_torch = np.argmax(output.numpy()) + + print(f"Torch top-1 id: {top1_torch}, class name: {synset[top1_torch]}") + + +if __name__ == "__main__": + app.run(main) From 3710e286b5837c4fbf079db2aaf143f7bcd1ff71 Mon Sep 17 00:00:00 2001 From: Minhae Ye Date: Fri, 1 Sep 2023 20:22:48 +0900 Subject: [PATCH 5/6] Choose backend --- experiments/amp_nhwc_autoscheduler/run.py | 115 ++++++++++++---------- 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/experiments/amp_nhwc_autoscheduler/run.py b/experiments/amp_nhwc_autoscheduler/run.py index 7b894083228d..7743253d9e40 100644 --- a/experiments/amp_nhwc_autoscheduler/run.py +++ b/experiments/amp_nhwc_autoscheduler/run.py @@ -17,6 +17,7 @@ FLAGS = flags.FLAGS +flags.DEFINE_enum("backend", "opencv_cpu", ["opencv_cpu", "tvm_gpu"], "Choose backend.") flags.DEFINE_enum( "model", "mobilenet_v2", ["mobilenet_v2", "resnet50", "efficientnet_v2_s"], "Choose model." ) @@ -88,62 +89,68 @@ def main(_): torch_model.eval() dummy_input = torch.randn(data.shape) - scripted_torch_model = torch.jit.trace(torch_model, dummy_input).eval() input_name = "input_1" - shape_list = [(input_name, data.shape)] - mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) - mod = tvm_amp(mod, params, to_nhwc=True) - params = None - - onnx_file = f"{FLAGS.model}.onnx" - torch.onnx.export(torch_model, dummy_input, onnx_file, input_names=[input_name]) - opencv_net = cv2.dnn.readNetFromONNX(onnx_file) - opencv_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV) - opencv_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU) - opencv_net.setInput(data) - - # Compile with the history best - log_file = f"{FLAGS.model}.json" - target = tvm.target.Target("cuda -arch=sm_72", host="llvm -mcpu=carmel") - print("Compile...") - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, config={"relay.backend.use_auto_scheduler": True} - ): - lib = relay.build(mod, target=target) - - # Create graph executor - dev = str(target) - module = graph_executor.GraphModule(lib["default"](dev)) - dtype = "float32" - data_tvm = tvm.nd.array(data.astype(dtype)) - module.set_input(input_name, data_tvm) - - # Evaluate - print("Evaluate inference time cost...") repeat = 50 - ftimer = module.module.time_evaluator("run", dev, repeat=repeat) - tvm_prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond - print( - f"Relay Mean inference time (std dev): {np.mean(tvm_prof_res):.2f} ms ({np.std(tvm_prof_res):.2f} ms)" - ) - module.run() - tvm_out = module.get_output(0) - top1_tvm = np.argmax(tvm_out.asnumpy()) - print(f"Relay top-1 id: {top1_tvm}, class name: {synset[top1_tvm]}") - - cv2.setNumThreads(1) - opencv_prof_res = [] - opencv_out = opencv_net.forward() - for _ in range(repeat): - opencv_net.forward() - opencv_prof_res.append(opencv_net.getPerfProfile()[0]) - opencv_prof_res = np.array(opencv_prof_res) * 1000.0 / cv2.getTickFrequency() - print( - f"OpenCV Mean inference time (std dev): {np.mean(opencv_prof_res):.2f} ms ({np.std(opencv_prof_res):.2f} ms)" - ) - top1_opencv = np.argmax(opencv_out) - print(f"OpenCV top-1 id: {top1_opencv}, class name: {synset[top1_opencv]}") + + if FLAGS.backend == "opencv_cpu": + onnx_file = f"{FLAGS.model}.onnx" + torch.onnx.export( + torch_model, dummy_input, onnx_file, input_names=[input_name], opset_version=11 + ) + opencv_net = cv2.dnn.readNetFromONNX(onnx_file) + opencv_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV) + opencv_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU) + opencv_net.setInput(data) + + # Evaluate + print("Evaluate inference time cost...") + opencv_prof_res = [] + opencv_out = opencv_net.forward() + for _ in range(repeat): + opencv_net.forward() + opencv_prof_res.append(opencv_net.getPerfProfile()[0]) + opencv_prof_res = np.array(opencv_prof_res) * 1000.0 / cv2.getTickFrequency() + print( + f"OpenCV Mean inference time (std dev): {np.mean(opencv_prof_res):.2f} ms ({np.std(opencv_prof_res):.2f} ms)" + ) + top1_opencv = np.argmax(opencv_out) + print(f"OpenCV top-1 id: {top1_opencv}, class name: {synset[top1_opencv]}") + else: + scripted_torch_model = torch.jit.trace(torch_model, dummy_input).eval() + shape_list = [(input_name, data.shape)] + mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) + mod = tvm_amp(mod, params, to_nhwc=True) + params = None + + # Compile with the history best + log_file = f"{FLAGS.model}.json" + target = tvm.target.Target("cuda -arch=sm_72", host="llvm -mcpu=carmel") + print("Compile...") + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target) + + # Create graph executor + dev = tvm.device(str(target)) + module = graph_executor.GraphModule(lib["default"](dev)) + dtype = "float32" + data_tvm = tvm.nd.array(data.astype(dtype)) + module.set_input(input_name, data_tvm) + + # Evaluate + print("Evaluate inference time cost...") + + ftimer = module.module.time_evaluator("run", dev, repeat=repeat) + tvm_prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print( + f"Relay Mean inference time (std dev): {np.mean(tvm_prof_res):.2f} ms ({np.std(tvm_prof_res):.2f} ms)" + ) + module.run() + tvm_out = module.get_output(0) + top1_tvm = np.argmax(tvm_out.asnumpy()) + print(f"Relay top-1 id: {top1_tvm}, class name: {synset[top1_tvm]}") if __name__ == "__main__": From 0d391bb9a72590c6bd9cdda9c670ee2b05e33653 Mon Sep 17 00:00:00 2001 From: Minhae Ye Date: Fri, 1 Sep 2023 20:24:17 +0900 Subject: [PATCH 6/6] Cache model --- experiments/amp_nhwc_autoscheduler/run.py | 68 ++++++++++++++--------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/experiments/amp_nhwc_autoscheduler/run.py b/experiments/amp_nhwc_autoscheduler/run.py index 7743253d9e40..6b4c503dc1f6 100644 --- a/experiments/amp_nhwc_autoscheduler/run.py +++ b/experiments/amp_nhwc_autoscheduler/run.py @@ -1,5 +1,6 @@ import os from copy import deepcopy +from pathlib import Path import cv2 import numpy as np @@ -17,6 +18,7 @@ FLAGS = flags.FLAGS +flags.DEFINE_bool("cached", True, "Use cached model.") flags.DEFINE_enum("backend", "opencv_cpu", ["opencv_cpu", "tvm_gpu"], "Choose backend.") flags.DEFINE_enum( "model", "mobilenet_v2", ["mobilenet_v2", "resnet50", "efficientnet_v2_s"], "Choose model." @@ -80,24 +82,24 @@ def main(_): with open(synset_path) as f: synset = eval(f.read()) - # Prepare model - torch_model = ( - timm.create_model("tf_efficientnetv2_s", pretrained=True) - if FLAGS.model == "efficientnet_v2_s" - else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") - ) - torch_model.eval() - dummy_input = torch.randn(data.shape) input_name = "input_1" repeat = 50 if FLAGS.backend == "opencv_cpu": - onnx_file = f"{FLAGS.model}.onnx" - torch.onnx.export( - torch_model, dummy_input, onnx_file, input_names=[input_name], opset_version=11 - ) - opencv_net = cv2.dnn.readNetFromONNX(onnx_file) + # Prepare model + onnx_file = Path(f"{FLAGS.model}.onnx") + if not (FLAGS.cached and onnx_file.exists()): + torch_model = ( + timm.create_model("tf_efficientnetv2_s", pretrained=True) + if FLAGS.model == "efficientnet_v2_s" + else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") + ) + torch_model.eval() + torch.onnx.export( + torch_model, dummy_input, onnx_file, input_names=[input_name], opset_version=11 + ) + opencv_net = cv2.dnn.readNetFromONNX(str(onnx_file)) opencv_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV) opencv_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU) opencv_net.setInput(data) @@ -116,21 +118,33 @@ def main(_): top1_opencv = np.argmax(opencv_out) print(f"OpenCV top-1 id: {top1_opencv}, class name: {synset[top1_opencv]}") else: - scripted_torch_model = torch.jit.trace(torch_model, dummy_input).eval() - shape_list = [(input_name, data.shape)] - mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) - mod = tvm_amp(mod, params, to_nhwc=True) - params = None - - # Compile with the history best - log_file = f"{FLAGS.model}.json" + # Prepare model + lib_file = Path(f"{FLAGS.model}.tar") target = tvm.target.Target("cuda -arch=sm_72", host="llvm -mcpu=carmel") - print("Compile...") - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, config={"relay.backend.use_auto_scheduler": True} - ): - lib = relay.build(mod, target=target) + if not (FLAGS.cached and lib_file.exists()): + torch_model = ( + timm.create_model("tf_efficientnetv2_s", pretrained=True) + if FLAGS.model == "efficientnet_v2_s" + else getattr(torchvision.models, FLAGS.model)(weights="DEFAULT") + ) + torch_model.eval() + + scripted_torch_model = torch.jit.trace(torch_model, dummy_input).eval() + shape_list = [(input_name, data.shape)] + mod, params = relay.frontend.from_pytorch(scripted_torch_model, shape_list) + mod = tvm_amp(mod, params, to_nhwc=True) + params = None + + # Compile with the history best + log_file = f"{FLAGS.model}.json" + print("Compile...") + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target) + lib.export_library(lib_file) + lib = tvm.runtime.load_module(lib_file) # Create graph executor dev = tvm.device(str(target))