diff --git a/python/tvm/auto_scheduler/testing/tune_onnx.py b/python/tvm/auto_scheduler/testing/tune_onnx.py index 5444794cf1aa..a3299c05bb82 100644 --- a/python/tvm/auto_scheduler/testing/tune_onnx.py +++ b/python/tvm/auto_scheduler/testing/tune_onnx.py @@ -15,18 +15,18 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import os - -from distutils.util import strtobool -import numpy as np # type: ignore import onnx # type: ignore + import tvm from tvm import auto_scheduler from tvm import meta_schedule as ms from tvm import relay from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc +from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer from tvm.meta_schedule.utils import cpu_count from tvm.relay.frontend import from_onnx from tvm.support import describe @@ -96,17 +96,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -135,6 +141,7 @@ def main(): repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, + timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": @@ -163,102 +170,63 @@ def main(): onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: - print(f" input_name: {item['name']}") + print(f" input_name : {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) - tasks, task_weights = auto_scheduler.extract_tasks( - mod["main"], - params, - target=ARGS.target, - hardware_params=hardware_params, - ) - for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): - print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") - print(task.compute_dag) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune( - auto_scheduler.TuningOptions( - num_measure_trials=ARGS.num_trials, - runner=runner, - measure_callbacks=[ - auto_scheduler.RecordToFile(log_file), - ], - ), - adaptive_training=ARGS.adaptive_training, - ) - - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, - ): - lib = relay.build( - mod, - target=ARGS.target, - params=params, + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } + + with ms.Profiler() as profiler: + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], + params, + target=ARGS.target, + hardware_params=hardware_params, + ) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + print( + f"==== Task {idx}: {task.desc} " + f"(weight {task_weight} key: {task.workload_key}) =====" ) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - input_data = {} - for item in 
ARGS.input_shape: - input_name, input_shape, input_dtype = item["name"], item["shape"], item["dtype"] - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype + print(task.compute_dag) + + if ARGS.num_trials > 0: + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + num_measure_trials=ARGS.num_trials, + runner=runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ), + adaptive_training=ARGS.adaptive_training, ) - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) + relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend] + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay_build( + mod, + target=ARGS.target, + params=params, + ) + print("Tuning Time:") + print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] - graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py index fedb27281a44..fe747af7972c 100644 --- a/python/tvm/auto_scheduler/testing/tune_relay.py +++ b/python/tvm/auto_scheduler/testing/tune_relay.py @@ -15,18 +15,18 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import os -from distutils.util import strtobool -import numpy as np # type: ignore import tvm from tvm import auto_scheduler from tvm import meta_schedule as ms from tvm import relay from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.meta_schedule.testing.relay_workload import get_network +from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer from tvm.meta_schedule.utils import cpu_count from tvm.support import describe @@ -94,17 +94,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -133,6 +139,7 @@ def main(): repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, + timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": @@ -164,100 +171,62 @@ def main(): cache_dir=ARGS.cache_dir, ) input_info = {input_name: input_shape} - input_data = {} + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } for input_name, input_shape in input_info.items(): - print(f" input_name: {input_name}") + print(f" input_name : {input_name}") print(f" input_shape: {input_shape}") print(f" input_dtype: {input_dtype}") - tasks, task_weights = auto_scheduler.extract_tasks( - mod["main"], - params, - target=ARGS.target, - hardware_params=hardware_params, - ) - for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): - print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") - print(task.compute_dag) - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune( - auto_scheduler.TuningOptions( - num_measure_trials=ARGS.num_trials, - runner=runner, - measure_callbacks=[ - auto_scheduler.RecordToFile(log_file), - ], - ), - adaptive_training=ARGS.adaptive_training, - ) - - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, - ): - lib = relay.build( - mod, - target=ARGS.target, - params=params, + with ms.Profiler() as profiler: + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], + params, + target=ARGS.target, + hardware_params=hardware_params, + ) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + print( + f"==== Task {idx}: {task.desc} " + f"(weight {task_weight} key: {task.workload_key}) =====" ) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - for input_name, input_shape in input_info.items(): - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype + print(task.compute_dag) + + if ARGS.num_trials > 0: + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + 
num_measure_trials=ARGS.num_trials, + runner=runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ), + adaptive_training=ARGS.adaptive_training, ) - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) + relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend] + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay_build( + mod, + target=ARGS.target, + params=params, + ) + print("Tuning Time:") + print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] - graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/auto_scheduler/testing/tune_te.py b/python/tvm/auto_scheduler/testing/tune_te.py index c6a5ab27cfd8..da3584512dd0 100644 --- a/python/tvm/auto_scheduler/testing/tune_te.py +++ b/python/tvm/auto_scheduler/testing/tune_te.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import os -from distutils.util import strtobool import tvm from tvm import auto_scheduler +from tvm import meta_schedule as ms from tvm.meta_schedule.testing.te_workload import CONFIGS from tvm.meta_schedule.utils import cpu_count from tvm.support import describe @@ -79,20 +80,26 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, + required=False, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) + parsed.rpc_config = ms.runner.RPCConfig( + tracker_host=parsed.rpc_host, + tracker_port=parsed.rpc_port, + tracker_key=parsed.rpc_key, + session_timeout_sec=60, + ) return parsed @@ -100,12 +107,19 @@ def _parse_args(): def main(): - describe() - print(f"Workload: {ARGS.workload}") log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json") - workload_func, params = CONFIGS[ARGS.workload] - params = params[0] # type: ignore - workload_func = auto_scheduler.register_workload(workload_func) + + runner = auto_scheduler.RPCRunner( + key=ARGS.rpc_key, + host=ARGS.rpc_host, + port=ARGS.rpc_port, + n_parallel=cpu_count(logical=True), + number=ARGS.number, + repeat=ARGS.repeat, + min_repeat_ms=ARGS.min_repeat_ms, + enable_cpu_cache_flush=ARGS.cpu_flush, + timeout=ARGS.rpc_config.session_timeout_sec, + ) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( @@ -127,37 +141,42 @@ def main(): ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") - task = auto_scheduler.SearchTask( - func=workload_func, - args=params, - target=ARGS.target, - hardware_params=hardware_params, - ) - runner = auto_scheduler.RPCRunner( - key=ARGS.rpc_key, - host=ARGS.rpc_host, - port=ARGS.rpc_port, - n_parallel=cpu_count(logical=True), - number=ARGS.number, - repeat=ARGS.repeat, - min_repeat_ms=ARGS.min_repeat_ms, - enable_cpu_cache_flush=ARGS.cpu_flush, - # todo(zxybazh): set session timeout to 60 same as MS - ) - # Inspect the computational graph - print("Computational DAG:") - print(task.compute_dag) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=ARGS.num_trials, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, - runner=runner, - ) - print("Running AutoTuning:") - task.tune(tune_option, adaptive_training=ARGS.adaptive_training) + describe() + print(f"Workload: {ARGS.workload}") + with ms.Profiler() as profiler: + # Same as MetaSchedule Tune TE + # Does not count ApplyHistoryBest time + + workload_func, params = CONFIGS[ARGS.workload] + params = params[0] # type: ignore + workload_func = auto_scheduler.register_workload(workload_func) + + task = auto_scheduler.SearchTask( + func=workload_func, + args=params, + target=ARGS.target, + hardware_params=hardware_params, + ) + # Inspect the computational graph + print("Computational DAG:") + print(task.compute_dag) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=ARGS.num_trials, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, + runner=runner, + ) + if ARGS.num_trials > 0: + print("Running AutoTuning:") + task.tune(tune_option, adaptive_training=ARGS.adaptive_training) + + print("Tuning Time:") + 
print(profiler.table()) + print("History Best:") print(task.print_best(log_file)) + sch, args = task.apply_best(log_file) print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True)) diff --git a/python/tvm/meta_schedule/cost_model/cost_model.py b/python/tvm/meta_schedule/cost_model/cost_model.py index e479cb725428..2fdb9b93494f 100644 --- a/python/tvm/meta_schedule/cost_model/cost_model.py +++ b/python/tvm/meta_schedule/cost_model/cost_model.py @@ -73,7 +73,7 @@ def update( _ffi_api.CostModelUpdate(self, context, candidates, results) # type: ignore # pylint: disable=no-member def predict(self, context: TuneContext, candidates: List[MeasureCandidate]) -> np.ndarray: - """Update the cost model given running results. + """Predict normalized score with the cost model. Parameters ---------- diff --git a/python/tvm/meta_schedule/testing/custom_builder_runner.py b/python/tvm/meta_schedule/testing/custom_builder_runner.py index 3ba007d9a4d3..e203848c2cbb 100644 --- a/python/tvm/meta_schedule/testing/custom_builder_runner.py +++ b/python/tvm/meta_schedule/testing/custom_builder_runner.py @@ -17,7 +17,7 @@ """Customized builder and runner methods""" # pylint: disable=import-outside-toplevel -from typing import TYPE_CHECKING, Callable, Dict, List +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union if TYPE_CHECKING: import numpy as np # type: ignore @@ -25,6 +25,7 @@ from tvm.meta_schedule.runner import EvaluatorConfig, RPCConfig from tvm.runtime import Device, Module, NDArray from tvm.target import Target + from tvm.runtime.vm import Executable def build_relay( @@ -143,10 +144,11 @@ def run_with_graph_executor( def run_module_via_rpc( rpc_config: "RPCConfig", - lib: "Module", + lib: Union["Module", "Executable"], dev_type: str, args: Dict[str, "np.ndarray"], continuation: Callable, + backend: Optional[str] = "graph", ): """Execute a tvm.runtime.Module on RPC remote""" # pylint: disable=import-outside-toplevel @@ -160,13 +162,15 @@ def run_module_via_rpc( with tempfile.TemporaryDirectory() as tmp_dir: filename = os.path.join(tmp_dir, "tvm_tmp_mod." + tar.output_format) + if backend == "vm": + code, lib = lib.save() lib.export_library(filename, tar) session = rpc_config.connect_server() session.upload(filename) _, filename = os.path.split(filename) rt_mod = session.load_module(filename) + if backend == "vm": + rt_mod = session.get_function("runtime.Load_Executable")(code, rt_mod) dev = session.device(dev_type=dev_type, dev_id=0) - nd_args = {} - for arg_key, arg_value in args.items(): - nd_args[arg_key] = ndarray.array(arg_value, dev) + nd_args = {k: ndarray.array(v, dev) for k, v in args.items()} return continuation(rt_mod, dev, nd_args) diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py b/python/tvm/meta_schedule/testing/tune_onnx.py index 8ae9ab1ed07d..6d473ed3237c 100644 --- a/python/tvm/meta_schedule/testing/tune_onnx.py +++ b/python/tvm/meta_schedule/testing/tune_onnx.py @@ -15,18 +15,18 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import logging - -from distutils.util import strtobool -import numpy as np # type: ignore import onnx # type: ignore + import tvm from tvm import meta_schedule as ms from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.relay.frontend import from_onnx from tvm.support import describe +from .tune_utils import generate_input_data, create_timer def _parse_args(): @@ -93,17 +93,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -127,14 +133,19 @@ def _parse_args(): def main(): describe() print(f"Workload: {ARGS.model_name}") + onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: - print(f" input_name: {item['name']}") + print(f" input_name : {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } + runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( @@ -145,6 +156,7 @@ def main(): ), alloc_repeat=1, ) + with ms.Profiler() as profiler: lib = ms.tune_relay( mod=mod, @@ -159,68 +171,18 @@ def main(): runner=runner, # type: ignore work_dir=ARGS.work_dir, params=params, + backend=ARGS.backend, ) print("Tuning Time:") print(profiler.table()) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - input_data = {} - for item in ARGS.input_shape: - input_name, input_shape, input_dtype = item["name"], item["shape"], item["dtype"] - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype - ) - - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] 
- graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py index daef48daa22f..8010e36fd656 100644 --- a/python/tvm/meta_schedule/testing/tune_relay.py +++ b/python/tvm/meta_schedule/testing/tune_relay.py @@ -15,16 +15,16 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import logging -from distutils.util import strtobool -import numpy as np # type: ignore import tvm from tvm import meta_schedule as ms from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.meta_schedule.testing.relay_workload import get_network +from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer from tvm.support import describe @@ -91,17 +91,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -125,17 +131,21 @@ def _parse_args(): def main(): describe() print(f"Workload: {ARGS.workload}") + mod, params, (input_name, input_shape, input_dtype) = get_network( ARGS.workload, ARGS.input_shape, cache_dir=ARGS.cache_dir, ) input_info = {input_name: input_shape} - input_data = {} + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } for input_name, input_shape in input_info.items(): - print(f" input_name: {input_name}") + print(f" input_name : {input_name}") print(f" input_shape: {input_shape}") print(f" input_dtype: {input_dtype}") + runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( @@ -146,6 +156,7 @@ def main(): ), alloc_repeat=1, ) + with ms.Profiler() as profiler: lib = ms.tune_relay( mod=mod, @@ -160,66 +171,19 @@ def main(): runner=runner, # type: ignore work_dir=ARGS.work_dir, params=params, + backend=ARGS.backend, ) + print("Tuning Time:") print(profiler.table()) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - for input_name, input_shape in input_info.items(): - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype - ) - - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = 
GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] - graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/meta_schedule/testing/tune_te.py b/python/tvm/meta_schedule/testing/tune_te.py index e579c561adaa..d54d92048ee6 100644 --- a/python/tvm/meta_schedule/testing/tune_te.py +++ b/python/tvm/meta_schedule/testing/tune_te.py @@ -15,14 +15,14 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import logging -from distutils.util import strtobool from typing import Optional import tvm -from tvm import meta_schedule as ms from tvm import tir +from tvm import meta_schedule as ms from tvm.meta_schedule.testing.te_workload import create_te_workload from tvm.support import describe @@ -80,17 +80,17 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, + required=False, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -138,8 +138,10 @@ def main(): task_name=ARGS.workload, work_dir=ARGS.work_dir, ) + print("Tuning Time:") print(profiler.table()) + if sch is None: print("No valid schedule found!") else: diff --git a/python/tvm/meta_schedule/testing/tune_utils.py b/python/tvm/meta_schedule/testing/tune_utils.py new file mode 100644 index 000000000000..aad8496a4661 --- /dev/null +++ b/python/tvm/meta_schedule/testing/tune_utils.py @@ -0,0 +1,194 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Testing utility functions in meta schedule"""
+from typing import Callable, Optional, Union, List, Dict
+from statistics import median
+import json
+import warnings
+import numpy as np  # type: ignore
+
+import tvm
+from tvm.runtime import NDArray
+
+
+def generate_input_data(
+    input_shape: List[int],
+    input_dtype: str,
+    *,
+    low: Optional[int] = None,
+    high: Optional[int] = None,
+) -> np.ndarray:
+    """Generate input data with the given shape and data type.
+
+    Parameters
+    ----------
+    input_shape : List[int]
+        The shape of the input data.
+    input_dtype : str
+        The data type of the input data.
+    low : Optional[int]
+        The inclusive low bound for generated integer data; 0 if not given.
+    high : Optional[int]
+        The exclusive high bound for generated integer data; 10000 if not given.
+
+    Returns
+    -------
+    input_data : np.ndarray
+        The generated input data with the given shape and data type, as a numpy ndarray.
+    """
+    if input_dtype.startswith("float"):
+        return np.random.uniform(size=input_shape).astype(input_dtype)
+    if input_dtype in ["uint8", "int8"]:
+        return np.random.randint(
+            low=0,
+            high=127,
+            size=input_shape,
+            dtype="int32",  # TODO(zxybazh): fix the datatype when int8 / uint8 is supported better
+        )
+    if input_dtype in ["int32", "int64"]:
+        if low is None or high is None:
+            warnings.warn(
+                f"Model input value range for shape {input_shape} of {input_dtype} is not set!"
+            )
+        return np.random.randint(
+            low=0 if low is None else low,
+            high=10000 if high is None else high,
+            size=input_shape,
+            dtype=input_dtype,
+        )
+    raise ValueError(f"Unsupported input datatype: {input_dtype}")
+
+
+def create_timer(backend: str) -> Callable:
+    """Create a function to run and benchmark the performance of the whole given
+    runtime module, or of an Executable in the Relay VM.
+
+    Parameters
+    ----------
+    backend : str
+        The backend to use: "graph" or "vm".
+
+    Returns
+    -------
+    func : Callable
+        The function to benchmark the workload.
+    """
+
+    def f_timer(
+        rt_mod: Union[tvm.runtime.Module, tvm.runtime.vm.Executable],
+        dev: tvm.device,
+        input_data: Dict[str, NDArray],
+    ) -> None:
+        """Run and benchmark the given runtime module, and print out the result.
+
+        Parameters
+        ----------
+        rt_mod : Union[tvm.runtime.Module, tvm.runtime.vm.Executable]
+            The runtime module or VM executable.
+        dev : tvm.device
+            The device to run the workload on.
+        input_data : Dict[str, NDArray]
+            The input data as a dictionary.
+        """
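+        # Local imports: only the executor matching `backend` is used on each call.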
+        from tvm.contrib.graph_executor import GraphModule  # pylint:disable=import-outside-toplevel
+        from tvm.runtime.vm import VirtualMachine  # pylint:disable=import-outside-toplevel
+
+        try:
+            if backend == "vm":
+                vm = VirtualMachine(rt_mod, dev)  # pylint: disable=invalid-name
+                ftimer = vm.benchmark(
+                    dev, min_repeat_ms=500, repeat=5, number=1, end_to_end=False, **input_data
+                )
+            elif backend == "graph":
+                mod = GraphModule(rt_mod["default"](dev))
+                for input_name, input_value in input_data.items():
+                    mod.set_input(input_name, input_value)
+                ftimer = mod.module.time_evaluator(
+                    "run", dev, min_repeat_ms=500, repeat=5, number=1
+                )()
+            else:
+                raise ValueError(f"Backend {backend} not supported in f_timer!")
+
+            results = list(np.array(ftimer.results) * 1000.0)  # type: ignore
+
+            print("Running time in time_evaluator: ", results)
+            print("-------------------------------")
+            print(f"    Min (ms) : {min(results)}")
+            print(f"    Max (ms) : {max(results)}")
+            print(f" Median (ms) : {median(results)}")
+            print(f"Average (ms) : {sum(results) / len(results)}")
+        except Exception as exc:  # pylint: disable=broad-except
+            print(
+                f"Run module f_timer via RPC failed, exception: {exc}",
+            )
+
+    return f_timer
+
+
+def create_time_per_layer(graph: str) -> Callable:
+    """Create a function to run and benchmark the per-layer performance of a given
+    runtime module, using the json graph produced for it by the graph compiler.
+
+    Parameters
+    ----------
+    graph : str
+        The json format graph output of the module from the graph compiler.
+
+    Returns
+    -------
+    func : Callable
+        The function using the json format graph.
+    """
+
+    def f_time_per_layer(
+        rt_mod: tvm.runtime.Module,
+        dev: tvm.device,
+        input_data: Dict[str, NDArray],
+    ) -> None:
+        """Run and benchmark the per-layer performance of a given runtime module,
+        and print out the result.
+
+        Parameters
+        ----------
+        rt_mod : tvm.runtime.Module
+            The runtime module.
+        dev : tvm.device
+            The device to run the workload on.
+        input_data : Dict[str, NDArray]
+            The input data as a dictionary.
+        """
+        # pylint:disable=import-outside-toplevel
+        from tvm.contrib.debugger.debug_executor import create
+
+        # pylint:enable=import-outside-toplevel
+
+        try:
+            mod = create(graph, rt_mod, dev)
+            for input_name, input_value in input_data.items():
+                mod.set_input(input_name, input_value)
+            graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
+            graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000)
+
+            print("Running time of each layer:")
+            print("---------------------------")
+            print("|graph_nodes| = ", len(graph_nodes))
+            print("|graph_time| = ", len(graph_time))
+
+            for k, v in zip(graph_nodes, graph_time):
+                print(k, float(v) * 1e6, "us")
+        except Exception as exc:  # pylint: disable=broad-except
+            print(
+                f"Run module f_time_per_layer via RPC failed, exception: {exc}",
+            )
+
+    return f_time_per_layer
diff --git a/python/tvm/meta_schedule/testing/utils.py b/python/tvm/meta_schedule/testing/utils.py
index bdd3852e40a3..0d011b726473 100644
--- a/python/tvm/meta_schedule/testing/utils.py
+++ b/python/tvm/meta_schedule/testing/utils.py
@@ -16,13 +16,12 @@
 # under the License.
"""Testing utility functions in meta schedule""" from typing import Callable, Dict, Optional, Union - -from tvm import meta_schedule as ms from tvm.ir import IRModule from tvm.relay import Function as RelayFunc from tvm.runtime import NDArray from tvm.target import Target from tvm.tir import Schedule +from tvm import meta_schedule as ms def apply_fixed_schedules( diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index fabf14ab23c7..cd40429d1684 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -24,7 +24,7 @@ from tvm.ir import IRModule from tvm.ir.transform import PassContext -from tvm.runtime import Module, NDArray +from tvm.runtime import Module, NDArray, vm from tvm.target import Target from tvm.te import Tensor, create_prim_func from tvm.tir import PrimFunc, Schedule @@ -346,8 +346,9 @@ def tune_extracted_tasks( cost_model=cost_model, measure_callbacks=measure_callbacks, ) - task_scheduler.tune() - cost_model.save(osp.join(work_dir, "cost_model.xgb")) + if config.max_trials_global > 0: + task_scheduler.tune() + cost_model.save(osp.join(work_dir, "cost_model.xgb")) return database @@ -516,6 +517,7 @@ def tune_relay( config: TuneConfig, work_dir: str, *, + backend: str = "graph", params: Optional[Dict[str, NDArray]] = None, builder: Optional[Builder] = None, runner: Optional[Runner] = None, @@ -527,7 +529,7 @@ def tune_relay( postprocs: Optional[FnPostproc] = None, mutator_probs: Optional[FnMutatorProb] = None, num_threads: Optional[int] = None, -) -> Module: +) -> Union[Module, vm.Executable]: """Tune a TIR IRModule with a given target. Parameters @@ -552,15 +554,16 @@ def tune_relay( The database to use. measure_callbacks : Optional[List[MeasureCallback]] The callbacks used during tuning. + backend : str = "graph" + The backend to use for relay compilation(graph / vm). Returns ------- - lib : Module - The built runtime module for the given relay workload. + lib : Union[Module, tvm.runtime.vm.Executable] + The built runtime module or vm Executable for the given relay workload. """ # pylint: disable=import-outside-toplevel - from tvm.relay import build as relay_build - + from tvm import relay from .relay_integration import extract_task_from_relay # pylint: disable=protected-access, enable=import-outside-toplevel @@ -584,6 +587,7 @@ def tune_relay( mutator_probs=mutator_probs, num_threads=num_threads, ) + relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend] with Profiler.timeit("ApplyHistoryBest"): with target, autotvm_silencer(), ApplyHistoryBest(database): with PassContext(