From a8d490d7a5427256e8968a3a3554eeb4aa9a3d89 Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Sat, 11 Feb 2023 20:11:48 -0500 Subject: [PATCH] [Fix][MetaSchedule] RPCRunner timeout when queueing up This PR fixes the timeout rule of MetaSchedule RPCRunner. Prior to this PR, the RPCRunner sets a timeout threshold for jobs submitted to popen pool. As a result, the jobs are timed since the time that they are sent to the remote side. Consider the case where there is only a single device for measurement. In this case, all jobs can only be executed serially and jobs must queue up. Therefore, the previous timeout configuration means the time spent on queueing will also be counted. This causes some jobs, in the worst cases, gets timeout without even started to execute, and has negative impacts on RPC MetaSchedule tuning, from the perspectives of both efficiency and result performance. To address the problem, this PR * removes the RPCRunner timeout requirement for PopenExecutor. Since now, the RPCRunner timeout will only be raised from the RPC server side. * removes the timeout information in RPCRunnerFuture, as timeout is no longer issued by popen side, as mentioned in the point above. * changes the default maximum number of RPCRunner workers to 1, which is to avoid the case where too many jobs waiting in line. Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> --- 3rdparty/cutlass | 2 +- python/tvm/meta_schedule/runner/rpc_runner.py | 20 ++------- .../unittest/test_meta_schedule_runner.py | 41 ++++++++++++------- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/3rdparty/cutlass b/3rdparty/cutlass index c2ee13a0fe992..5ff5209ed5cf0 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit c2ee13a0fe99241b0e798ce647acf98e237f1d0c +Subproject commit 5ff5209ed5cf006f2713e4c5406e530b88fbf553 diff --git a/python/tvm/meta_schedule/runner/rpc_runner.py b/python/tvm/meta_schedule/runner/rpc_runner.py index 9bdf715756cc5..22bec1e8ec788 100644 --- a/python/tvm/meta_schedule/runner/rpc_runner.py +++ b/python/tvm/meta_schedule/runner/rpc_runner.py @@ -27,7 +27,6 @@ from ..logging import get_logger from ..profiler import Profiler from ..utils import ( - cpu_count, derived_object, get_global_func_on_rpc_session, get_global_func_with_default_on_worker, @@ -92,26 +91,20 @@ class RPCRunnerFuture(PyRunnerFuture): ---------- future: concurrent.futures.Future The concurrent function to check when the function is done and to return the result. - timeout_sec: float - The timeout in seconds. """ future: concurrent.futures.Future - timeout_sec: float - def __init__(self, future: concurrent.futures.Future, timeout_sec: float) -> None: + def __init__(self, future: concurrent.futures.Future) -> None: """Constructor Parameters ---------- future: concurrent.futures.Future The concurrent function to check when the function is done and to return the result. - timeout_sec: float - The timeout in seconds. """ super().__init__() self.future = future - self.timeout_sec = timeout_sec def done(self) -> bool: return self.future.done() @@ -119,11 +112,6 @@ def done(self) -> bool: def result(self) -> RunnerResult: try: run_secs: List[float] = self.future.result() - except TimeoutError: - return RunnerResult( - None, - error_msg=f"RPCRunner: Timeout, killed after {self.timeout_sec} seconds", - ) except Exception as exception: # pylint: disable=broad-except return RunnerResult( None, @@ -270,7 +258,7 @@ def __init__( f_cleanup: Union[T_CLEANUP, str, None] The function name to cleanup the session or the function itself. max_workers: Optional[int] = None - The maximum number of connections. Defaults to number of logical CPU cores. + The maximum number of connections. Defaults to 1. initializer: Optional[Callable[[], None]] The initializer function. """ @@ -285,11 +273,10 @@ def __init__( self.f_run_evaluator = f_run_evaluator self.f_cleanup = f_cleanup if max_workers is None: - max_workers = cpu_count(logical=True) + max_workers = 1 logger.info("RPCRunner: max_workers = %d", max_workers) self.pool = PopenPoolExecutor( max_workers=max_workers, - timeout=rpc_config.session_timeout_sec, initializer=initializer, ) self._sanity_check() @@ -312,7 +299,6 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]: str(runner_input.device_type), tuple(arg_info.as_json() for arg_info in runner_input.args_info), ), - timeout_sec=self.rpc_config.session_timeout_sec, ) results.append(future) # type: ignore return results diff --git a/tests/python/unittest/test_meta_schedule_runner.py b/tests/python/unittest/test_meta_schedule_runner.py index a79498304b2f5..59c7d5441efa3 100644 --- a/tests/python/unittest/test_meta_schedule_runner.py +++ b/tests/python/unittest/test_meta_schedule_runner.py @@ -131,6 +131,23 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-s C[vi] = A[vi] + B[vi] +# A huge matmul that must cause timeout in the timeout test below. +@tvm.script.ir_module +class MatmulHugeModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-self-argument + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, (4096, 4096), "float32") + B = T.match_buffer(b, (4096, 4096), "float32") + C = T.match_buffer(c, (4096, 4096), "float32") + for i, j, k in T.grid(4096, 4096, 4096): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + + # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring @@ -372,22 +389,20 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]: def test_meta_schedule_rpc_runner_time_out(): - """Test meta schedule RPC Runner time out""" + """Test meta schedule RPC Runner time out by using a super large workload""" - def initializer(): - @register_func("meta_schedule.runner.test_time_out") - def timeout_session_creator( # pylint: disable=unused-variable - rpc_config: RPCConfig, # pylint: disable=unused-argument - ) -> RPCSession: - time.sleep(2) + builder = LocalBuilder() + builder_inputs = [BuilderInput(MatmulHugeModule, Target("llvm"))] + builder_results = builder.build(builder_inputs) + builder_results[0].artifact_path runner_input = RunnerInput( - "test", + builder_results[0].artifact_path, "llvm", [ - TensorInfo("float32", (MATMUL_N, MATMUL_N)), - TensorInfo("float32", (MATMUL_N, MATMUL_N)), - TensorInfo("float32", (MATMUL_N, MATMUL_N)), + TensorInfo("float32", (4096, 4096)), + TensorInfo("float32", (4096, 4096)), + TensorInfo("float32", (4096, 4096)), ], ) @@ -408,15 +423,13 @@ def timeout_session_creator( # pylint: disable=unused-variable runner = RPCRunner( rpc_config, evaluator_config, - initializer=initializer, - f_create_session="meta_schedule.runner.test_time_out", ) # Run the module (runner_future,) = runner.run([runner_input]) runner_result = runner_future.result() assert runner_result.error_msg is not None and runner_result.error_msg.startswith( - "RPCRunner: Timeout, killed after" + "RPCRunner: An exception occurred" ) assert runner_result.run_secs is None