[GRAPH EXECUTOR,VM] Add benchmarking function to graph executor and vm #8807

Merged · 6 commits · Aug 26, 2021
Diff shown: changes from 3 commits
59 changes: 59 additions & 0 deletions python/tvm/contrib/graph_executor.py
@@ -320,3 +320,62 @@ def __getitem__(self, key):
The key to the module.
"""
return self.module[key]

def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=None, **kwargs):
"""Calculate runtime of a function by repeatedly calling it.

Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models.

The benchmarking loop looks approximately like so:

.. code-block:: python

for r in range(repeat):
time_start = now()
for n in range(number):
func_name()
time_end = now()
total_times.append((time_end - time_start)/number)


Parameters
----------
func_name : str
The function to benchmark

repeat : int
Number of times to run the outer loop of the timing code (see above). The output will
contain `repeat` number of datapoints.

number : int
Number of times to run the inner loop of the timing code. This inner loop is run in
between the timer starting and stopping. In order to amortize any timing overhead,
`number` should be increased when the runtime of the function is small (less than 1/10
of a millisecond).

min_repeat_ms : Optional[float]
If set, the inner loop will be run until it takes longer than `min_repeat_ms`
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.

kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.

Returns
-------
timing_results : ProfileResult
Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
access the individual runtimes.
"""

Contributor: I think this should be BenchmarkResult.
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
)()
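
For context, a minimal usage sketch of the new API (not part of the diff; it mirrors the test added to test_backend_graph_executor.py at the bottom of this PR):

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm.relay.testing import mlp

# Build a small example model and create a graph executor for it.
mod, params = mlp.get_workload(1)
lib = relay.build(mod, target="llvm", params=params)
dev = tvm.cpu()
exe = graph_executor.create(lib.get_graph_json(), lib.lib, dev)

# Inputs passed as kwargs are set once, before the timed loop starts.
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(dev, data=data, repeat=10, number=5)
print(result)       # formatted mean/median/max/min/std summary
print(result.mean)  # mean runtime in seconds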
27 changes: 7 additions & 20 deletions python/tvm/driver/tvmc/model.py
@@ -46,14 +46,15 @@
import os
import tarfile
import json
from typing import Optional, Union, List, Dict, Callable, TextIO
from typing import Optional, Union, Dict, Callable, TextIO
import numpy as np

import tvm
import tvm.contrib.cc
from tvm import relay
from tvm.contrib import utils
from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule
from tvm.runtime.module import BenchmarkResult

try:
from tvm.micro import export_model_library_format
@@ -371,14 +372,14 @@ def import_package(self, package_path: str):
class TVMCResult(object):
"""A class that stores the results of tvmc.run and provides helper utilities."""

def __init__(self, outputs: Dict[str, np.ndarray], times: List[float]):
def __init__(self, outputs: Dict[str, np.ndarray], times: BenchmarkResult):
"""Create a convenience wrapper around the output of tvmc.run

Parameters
----------
outputs : dict
Outputs dictionary mapping the name of the output to its numpy value.
times : list of float
times : BenchmarkResult
The execution times measured by the time evaluator in seconds to produce outputs.
"""
self.outputs = outputs
@@ -390,29 +391,15 @@ def format_times(self):
This has the effect of producing a small table that looks like:
.. code-block::
Execution time summary:
mean (ms) max (ms) min (ms) std (ms)
0.14310 0.16161 0.12933 0.01004
mean (ms) median (ms) max (ms) min (ms) std (ms)
0.14310 0.14310 0.16161 0.12933 0.01004

Returns
-------
str
A formatted string containing the statistics.
"""

# timestamps
mean_ts = np.mean(self.times) * 1000
std_ts = np.std(self.times) * 1000
max_ts = np.max(self.times) * 1000
min_ts = np.min(self.times) * 1000

header = "Execution time summary:\n{0:^10} {1:^10} {2:^10} {3:^10}".format(
"mean (ms)", "max (ms)", "min (ms)", "std (ms)"
)
stats = "{0:^10.2f} {1:^10.2f} {2:^10.2f} {3:^10.2f}".format(
mean_ts, max_ts, min_ts, std_ts
)

return "%s\n%s\n" % (header, stats)
return str(self.times)
Contributor: I think it's probably a bad idea to change the semantics of time_evaluator, which is no doubt used in many places where it's expected to return the raw list of times. I would recommend moving the BenchmarkResult to the benchmark() functions and not changing time_evaluator itself.

Contributor Author: This is just the printing code in TVMC; it is not modifying time_evaluator at all. The only change to the printing code is that it now outputs the median.


def get_output(self, name: str):
"""A helper function to grab one of the outputs by name.
8 changes: 2 additions & 6 deletions python/tvm/driver/tvmc/runner.py
@@ -421,12 +421,8 @@ def run_module(
# This print is intentional
print(report)

# create the module time evaluator (returns a function)
timer = module.module.time_evaluator("run", dev, number=number, repeat=repeat)
# call the evaluator function to invoke the module and save execution times
prof_result = timer()
# collect a list of execution times from the profiling results
times = prof_result.results
# call the benchmarking function of the executor
times = module.benchmark(dev, number=number, repeat=repeat)

logger.debug("Collecting the output tensors.")
num_outputs = module.get_num_outputs()
69 changes: 63 additions & 6 deletions python/tvm/runtime/module.py
@@ -20,7 +20,8 @@
import os
import ctypes
import struct
from collections import namedtuple
from typing import Sequence
import numpy as np

import tvm._ffi
from tvm._ffi.base import _LIB, check_call, c_str, string_types, _RUNTIME_ONLY
@@ -30,8 +31,65 @@
from . import _ffi_api


# profile result of time evaluator
ProfileResult = namedtuple("ProfileResult", ["mean", "results"])
class BenchmarkResult:
"""Runtimes from benchmarking"""

def __init__(self, results: Sequence[float]):
"""Construct a new BenchmarkResult from a sequence of runtimes.

Parameters
----------
results : Sequence[float]
Raw times from benchmarking

Attributes
----------
min : float
Minimum runtime in seconds of all results.
mean : float
Mean runtime in seconds of all results. Note that this mean is not
necessarily statistically correct as it is the mean of mean
runtimes.
median : float
Median runtime in seconds of all results. Note that this is not necessarily
statistically correct as it is the median of mean runtimes.
max : float
Maximum runtime in seconds of all results.
std : float
Standard deviation in seconds of runtimes. Note that this is not necessarily
correct as it is the std of mean runtimes.
results : Sequence[float]
The collected runtimes (in seconds). This may be a series of mean runtimes if
the benchmark was run with `number` > 1.
"""

Contributor: I think this needs more explanation. Currently a BenchmarkResult object contains no information on the benchmark parameters that were used, and it would be best (IMHO) to avoid there being surprises in terms of the interpretation of the results based on how the object was created. My recommendation would be to either fully document the behavior of what it means to benchmark with `number` > 1 or ensure that the BenchmarkResult object itself contains the benchmark parameters used.
self.results = results
self.mean = np.mean(self.results)
self.std = np.std(self.results)
self.median = np.median(self.results)
self.min = np.min(self.results)
self.max = np.max(self.results)

def __repr__(self):
return "BenchmarkResult(min={}, mean={}, median={}, max={}, std={}, results={})".format(
self.min, self.mean, self.median, self.max, self.std, self.results
)

def __str__(self):
return """Execution time summary:
{:^12} {:^12} {:^12} {:^12} {:^12}
{:^12.2f} {:^12.2f} {:^12.2f} {:^12.2f} {:^12.2f}
""".format(
"mean (ms)",
"median (ms)",
"max (ms)",
"min (ms)",
"std (ms)",
self.mean * 1000,
self.median * 1000,
self.max * 1000,
self.min * 1000,
self.std * 1000,
)

Contributor: 2 significant digits is not really enough for models that run quite fast. Would recommend using .4 instead.

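To make the derived statistics concrete, here is a tiny illustration (the numbers are made up, not from the PR):

# Three hypothetical per-repeat runtimes, in seconds.
res = BenchmarkResult([0.0101, 0.0110, 0.0098])
assert res.min <= res.median <= res.max
print(res.mean)   # arithmetic mean of the three values, in seconds
print(res)        # the "Execution time summary" table, printed in milliseconds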

class Module(object):
@@ -209,7 +267,7 @@ def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f
Returns
-------
ftimer : function
The function that takes same argument as func and returns a ProfileResult.
The function that takes same argument as func and returns a BenchmarkResult.
The BenchmarkResult reports `repeat` time costs in seconds.
"""
try:
@@ -230,8 +288,7 @@ def evaluator(*args):
blob = feval(*args)
fmt = "@" + ("d" * repeat)
results = struct.unpack(fmt, blob)
mean = sum(results) / float(repeat)
return ProfileResult(mean=mean, results=results)
return BenchmarkResult(results)
Contributor: Does time_evaluator itself have a unit test that needs to be updated? If not, would it make sense to add a quick one?

Contributor Author: time_evaluator was not changed, so it makes no sense to create/modify a unit test for it.

Contributor: Um, unless I'm reading this diff incorrectly (sorry if I'm getting confused by GitHub diffs!) this is a change to time_evaluator. Am I confused?

Contributor Author: Oh yes, this is the Python binding for time_evaluator. The output is essentially the same, so there is no need to modify the tests.

Contributor: Well, OK, but the existing tests for time_evaluator seem kind of weak. Here we're adding a new class (BenchmarkResult) which calculates a number of new statistics, but there are no tests for it, meaning a regression in this behavior (either a change to time_evaluator or BenchmarkResult) would not be caught by CI.

At minimum I'd recommend a unit test for BenchmarkResult() itself, which is easy to write. Not trying to be too nitpicky, but I do feel like we should be constantly improving our test coverage, and adding a new class without corresponding tests tends to be a red flag for me :-)

WDYT?

Contributor Author: To clarify, BenchmarkResult is just a renaming of ProfileResult (which already existed) plus a constructor. It is tested in an ad hoc manner in a variety of places. I've added some tests for its constructor.


return evaluator
except NameError:
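The constructor tests the author mentions are not included in this three-commit snapshot; a plausible sketch of such a test (values chosen so the statistics come out exact):

from tvm.runtime.module import BenchmarkResult

def test_benchmark_result_statistics():
    # [1, 2, 2, 5] gives exact mean/median/std values to compare against.
    r = BenchmarkResult([1.0, 2.0, 2.0, 5.0])
    assert r.mean == 2.5
    assert r.median == 2.0
    assert r.min == 1.0
    assert r.max == 5.0
    assert r.std == 1.5
    assert list(r.results) == [1.0, 2.0, 2.0, 5.0]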
64 changes: 64 additions & 0 deletions python/tvm/runtime/vm.py
@@ -507,3 +507,67 @@
The input index. -1 will be returned if the given input name is not found.
"""
return self._get_input_index(input_name, func_name)

def benchmark(
self, device, *args, func_name="main", repeat=5, number=5, min_repeat_ms=None, **kwargs
):
"""Calculate runtime of a function by repeatedly calling it.

Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models.

The benchmarking loop looks approximately like so:

.. code-block:: python

for r in range(repeat):
time_start = now()
for n in range(number):
func_name()
time_end = now()
total_times.append((time_end - time_start)/number)


Parameters
----------
func_name : str
The function to benchmark

repeat : int
Number of times to run the outer loop of the timing code (see above). The output will
contain `repeat` number of datapoints.

number : int
Number of times to run the inner loop of the timing code. This inner loop is run in
between the timer starting and stopping. In order to amortize any timing overhead,
`number` should be increased when the runtime of the function is small (less than 1/10
of a millisecond).

min_repeat_ms : Optional[float]
If set, the inner loop will be run until it takes longer than `min_repeat_ms`
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.

args : Sequence[Object]
Arguments to the function. These are cached before running timing code, so that data
transfer costs are not counted in the runtime.

kwargs : Dict[str, Object]
Named arguments to the function. These are cached like `args`.

Returns
-------
timing_results : ProfileResult
Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
access the individual runtimes.
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
return self.module.time_evaluator(
"invoke", device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
)(func_name)
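
An analogous usage sketch for the VM path (again not part of the diff; it mirrors the test added to test_vm.py below):

import numpy as np
import tvm
from tvm import runtime
from tvm.relay import vm
from tvm.relay.testing import mlp

# Compile a small example model for the VM and wrap it in a VirtualMachine.
mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target="llvm", params=params)
dev = tvm.cpu()
exe = runtime.vm.VirtualMachine(lib, dev)

# Positional args are bound via set_input("main", ...) before the timed
# "invoke" loop runs, so data-transfer cost is excluded from the timings.
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(dev, data, func_name="main", repeat=10, number=5)
print(result)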
5 changes: 3 additions & 2 deletions src/runtime/rpc/rpc_module.cc
@@ -417,8 +417,9 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
<< "Cannot find " << f_preproc_name << " in the global function";
f_preproc = *pf_preproc;
}
return WrapTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms,
f_preproc);
PackedFunc pf = m.GetFunction(name, false);
CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
return WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, f_preproc);
}
} else {
auto* pf = runtime::Registry::Get(name);
13 changes: 13 additions & 0 deletions tests/python/relay/test_backend_graph_executor.py
@@ -23,6 +23,7 @@
from tvm.contrib import graph_executor
from tvm.relay.op import add
import tvm.testing
from tvm.relay.testing import mlp

# @tq, @jr should we put this in testing ns?
def check_rts(expr, args, expected_result, mod=None):
@@ -322,5 +323,17 @@ def test_graph_executor_api():
assert mod.get_input_index("Invalid") == -1


@tvm.testing.requires_llvm
def test_benchmark():
mod, params = mlp.get_workload(1)
lib = relay.build(mod, target="llvm", params=params)
exe = graph_executor.create(lib.get_graph_json(), lib.lib, tvm.cpu())
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(tvm.cpu(), data=data, func_name="run", repeat=2, number=1)
assert result.mean == result.median
assert result.mean > 0
assert len(result.results) == 2

Contributor: A better test here would be to use unittest.patch() on the time_evaluator function itself so it always returns deterministic times, and hence you can fully test the behavior of the benchmark() function by itself, independent of the actual module or time_evaluator behavior.
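
A rough sketch of the unittest.mock approach suggested above (hypothetical; it is not what this revision of the PR does, and the stubbed timings are invented):

from unittest import mock

import numpy as np
import pytest
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm.relay.testing import mlp
from tvm.runtime.module import BenchmarkResult


def test_benchmark_with_mocked_time_evaluator():
    mod, params = mlp.get_workload(1)
    lib = relay.build(mod, target="llvm", params=params)
    exe = graph_executor.create(lib.get_graph_json(), lib.lib, tvm.cpu())
    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))

    # Stub out time_evaluator with fixed timings so the statistics reported
    # by benchmark() can be checked exactly, independent of the hardware.
    fixed = BenchmarkResult([0.1, 0.2, 0.3])
    with mock.patch("tvm.runtime.Module.time_evaluator", return_value=lambda: fixed):
        result = exe.benchmark(tvm.cpu(), data=data, repeat=3, number=1)

    assert result.mean == pytest.approx(0.2)
    assert list(result.results) == [0.1, 0.2, 0.3]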


if __name__ == "__main__":
pytest.main([__file__])
13 changes: 13 additions & 0 deletions tests/python/relay/test_vm.py
@@ -30,6 +30,7 @@
from tvm import rpc
import tvm.testing
from tvm.relay.transform import InferType
from tvm.relay.testing import mlp


def check_result(args, expected_result, mod=None):
@@ -955,5 +956,17 @@ def test_get_input_index():
assert vm_factory.get_input_index("invalid") == -1


@tvm.testing.requires_llvm
def test_benchmark():
mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target="llvm", params=params)
exe = runtime.vm.VirtualMachine(lib, tvm.cpu())
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(tvm.cpu(), data, func_name="main", repeat=2, number=1)
assert result.mean == result.median
assert result.mean > 0
assert len(result.results) == 2

Contributor: See comment above on using unittest.patch() so you can test the benchmark() function directly.


if __name__ == "__main__":
pytest.main([__file__])
6 changes: 1 addition & 5 deletions tutorials/auto_scheduler/tune_network_arm.py
@@ -349,11 +349,7 @@ def tune_and_evaluate():

# Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500)
prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond
print(
"Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))
)
print(module.benchmark(dev, repeat=3, min_repeat_ms=500))


# We do not run the tuning in our webpage server since the server doesn't have a Raspberry Pi,
4 changes: 1 addition & 3 deletions tutorials/auto_scheduler/tune_network_cuda.py
@@ -288,9 +288,7 @@ def run_tuning():

# Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500)
prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
print(module.benchmark(dev, repeat=3, min_repeat_ms=500))


#################################################################