From a8d490d7a5427256e8968a3a3554eeb4aa9a3d89 Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Sat, 11 Feb 2023 20:11:48 -0500
Subject: [PATCH] [Fix][MetaSchedule] RPCRunner timeout when queueing up

This PR fixes the timeout rule of MetaSchedule RPCRunner.

Prior to this PR, the RPCRunner sets a timeout threshold for jobs
submitted to popen pool. As a result, the jobs are timed since the
time that they are sent to the remote side.

Consider the case where there is only a single device for measurement.
In this case, all jobs can only be executed serially and jobs must
queue up. Therefore, the previous timeout configuration means the
time spent on queueing will also be counted. This causes some jobs,
in the worst cases, gets timeout without even started to execute,
and has negative impacts on RPC MetaSchedule tuning, from the
perspectives of both efficiency and result performance.

To address the problem, this PR
* removes the RPCRunner timeout requirement for PopenExecutor. Since
now, the RPCRunner timeout will only be raised from the RPC server
side.
* removes the timeout information in RPCRunnerFuture, as timeout is
no longer issued by popen side, as mentioned in the point above.
* changes the default maximum number of RPCRunner workers to 1, which
is to avoid the case where too many jobs waiting in line.

Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
---
 3rdparty/cutlass                              |  2 +-
 python/tvm/meta_schedule/runner/rpc_runner.py | 20 ++-------
 .../unittest/test_meta_schedule_runner.py     | 41 ++++++++++++-------
 3 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/3rdparty/cutlass b/3rdparty/cutlass
index c2ee13a0fe992..5ff5209ed5cf0 160000
--- a/3rdparty/cutlass
+++ b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit c2ee13a0fe99241b0e798ce647acf98e237f1d0c
+Subproject commit 5ff5209ed5cf006f2713e4c5406e530b88fbf553
diff --git a/python/tvm/meta_schedule/runner/rpc_runner.py b/python/tvm/meta_schedule/runner/rpc_runner.py
index 9bdf715756cc5..22bec1e8ec788 100644
--- a/python/tvm/meta_schedule/runner/rpc_runner.py
+++ b/python/tvm/meta_schedule/runner/rpc_runner.py
@@ -27,7 +27,6 @@
 from ..logging import get_logger
 from ..profiler import Profiler
 from ..utils import (
-    cpu_count,
     derived_object,
     get_global_func_on_rpc_session,
     get_global_func_with_default_on_worker,
@@ -92,26 +91,20 @@ class RPCRunnerFuture(PyRunnerFuture):
     ----------
     future: concurrent.futures.Future
         The concurrent function to check when the function is done and to return the result.
-    timeout_sec: float
-        The timeout in seconds.
     """
 
     future: concurrent.futures.Future
-    timeout_sec: float
 
-    def __init__(self, future: concurrent.futures.Future, timeout_sec: float) -> None:
+    def __init__(self, future: concurrent.futures.Future) -> None:
         """Constructor
 
         Parameters
         ----------
         future: concurrent.futures.Future
             The concurrent function to check when the function is done and to return the result.
-        timeout_sec: float
-            The timeout in seconds.
         """
         super().__init__()
         self.future = future
-        self.timeout_sec = timeout_sec
 
     def done(self) -> bool:
         return self.future.done()
@@ -119,11 +112,6 @@ def done(self) -> bool:
     def result(self) -> RunnerResult:
         try:
             run_secs: List[float] = self.future.result()
-        except TimeoutError:
-            return RunnerResult(
-                None,
-                error_msg=f"RPCRunner: Timeout, killed after {self.timeout_sec} seconds",
-            )
         except Exception as exception:  # pylint: disable=broad-except
             return RunnerResult(
                 None,
@@ -270,7 +258,7 @@ def __init__(
         f_cleanup: Union[T_CLEANUP, str, None]
             The function name to cleanup the session or the function itself.
         max_workers: Optional[int] = None
-            The maximum number of connections. Defaults to number of logical CPU cores.
+            The maximum number of connections. Defaults to 1.
         initializer: Optional[Callable[[], None]]
             The initializer function.
         """
@@ -285,11 +273,10 @@ def __init__(
         self.f_run_evaluator = f_run_evaluator
         self.f_cleanup = f_cleanup
         if max_workers is None:
-            max_workers = cpu_count(logical=True)
+            max_workers = 1
         logger.info("RPCRunner: max_workers = %d", max_workers)
         self.pool = PopenPoolExecutor(
             max_workers=max_workers,
-            timeout=rpc_config.session_timeout_sec,
             initializer=initializer,
         )
         self._sanity_check()
@@ -312,7 +299,6 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
                     str(runner_input.device_type),
                     tuple(arg_info.as_json() for arg_info in runner_input.args_info),
                 ),
-                timeout_sec=self.rpc_config.session_timeout_sec,
             )
             results.append(future)  # type: ignore
         return results
diff --git a/tests/python/unittest/test_meta_schedule_runner.py b/tests/python/unittest/test_meta_schedule_runner.py
index a79498304b2f5..59c7d5441efa3 100644
--- a/tests/python/unittest/test_meta_schedule_runner.py
+++ b/tests/python/unittest/test_meta_schedule_runner.py
@@ -131,6 +131,23 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-s
                 C[vi] = A[vi] + B[vi]
 
 
+# A huge matmul that must cause timeout in the timeout test below.
+@tvm.script.ir_module
+class MatmulHugeModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (4096, 4096), "float32")
+        B = T.match_buffer(b, (4096, 4096), "float32")
+        C = T.match_buffer(c, (4096, 4096), "float32")
+        for i, j, k in T.grid(4096, 4096, 4096):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
 # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring
 
 
@@ -372,22 +389,20 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
 
 
 def test_meta_schedule_rpc_runner_time_out():
-    """Test meta schedule RPC Runner time out"""
+    """Test meta schedule RPC Runner time out by using a super large workload"""
 
-    def initializer():
-        @register_func("meta_schedule.runner.test_time_out")
-        def timeout_session_creator(  # pylint: disable=unused-variable
-            rpc_config: RPCConfig,  # pylint: disable=unused-argument
-        ) -> RPCSession:
-            time.sleep(2)
+    builder = LocalBuilder()
+    builder_inputs = [BuilderInput(MatmulHugeModule, Target("llvm"))]
+    builder_results = builder.build(builder_inputs)
+    builder_results[0].artifact_path
 
     runner_input = RunnerInput(
-        "test",
+        builder_results[0].artifact_path,
         "llvm",
         [
-            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
-            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
-            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (4096, 4096)),
+            TensorInfo("float32", (4096, 4096)),
+            TensorInfo("float32", (4096, 4096)),
         ],
     )
 
@@ -408,15 +423,13 @@ def timeout_session_creator(  # pylint: disable=unused-variable
         runner = RPCRunner(
             rpc_config,
             evaluator_config,
-            initializer=initializer,
-            f_create_session="meta_schedule.runner.test_time_out",
         )
         # Run the module
         (runner_future,) = runner.run([runner_input])
         runner_result = runner_future.result()
 
     assert runner_result.error_msg is not None and runner_result.error_msg.startswith(
-        "RPCRunner: Timeout, killed after"
+        "RPCRunner: An exception occurred"
     )
     assert runner_result.run_secs is None