From 22d8a8dca33ab53acad8f7dce6554bd2e01dfd7a Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Wed, 21 Jun 2023 09:19:59 -0500
Subject: [PATCH 1/2] [UnitTest][NVPTX] Avoid cascading failures from CUDA
 postproc

Prior to this commit, the tests in
`test_tir_transform_inject_ptx_async_copy.py` registered the
`"tvm_callback_cuda_postproc"` function during pytest collection, and
used a global variable to disable its functionality outside of the
tests in this file.  This had two major issues.  First, if any other
test also installs a postproc function, these postproc function
required by the NVPTX tests would be overwritten.  Second, if one of
the NTPTX tests fails, the global variable controlling the postproc
function would not be reset, causing any subsequent CUDA-related tests
to also fail.

This commit updates these NVPTX tests to conditionally install the
postproc function, to de-register it after the test instead of
disabling its functionality, and to de-register it regardless of the
test result.

This issue was initially found when debugging
https://github.com/apache/tvm/pull/15103, when a failure in
`test_tir_transform_inject_ptx_async_copy.py::test_cp_async_in_if_then_else`
caused failures in 32 unrelated tests ([CI
link](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-gpu/detail/PR-15103/7/tests)).
---
 ...est_tir_transform_inject_ptx_async_copy.py | 83 ++++++++-----------
 1 file changed, 36 insertions(+), 47 deletions(-)

diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
index 1e1ef410b4e1..e15ef6402d6b 100644
--- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
@@ -14,11 +14,14 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import numpy as np
+
 import tvm
 import tvm.testing
 from tvm.script import tir as T
 
+import pytest
+import numpy as np
+
 
 def count_cp_async(stmt):
     num_alloc = [0]
@@ -351,36 +354,38 @@ def test_inject_async_copy_shared_dyn():
 """
 
 
-generated_code = ""
-support_async = True
+@pytest.fixture
+def postproc_if_missing_async_support():
+    arch = tvm.contrib.nvcc.get_target_compute_version()
+    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
+    support_async = major >= 8
 
+    func_name = "tvm_callback_cuda_postproc"
+    prev_postproc = None
 
-@tvm.register_func
-def tvm_callback_cuda_postproc(code, _):
-    global generated_code
-    global support_async
-    generated_code = code
-    # return a dummy code so that device < sm80 could build correctly
     if not support_async:
-        ret = ""
-        for line in code.split("\n"):
-            ret += line + "\n"
-            if line.startswith('extern "C" __global__'):
-                break
-        ret += "}"
-        return ret
-    return code
+        prev_postproc = tvm.get_global_func(func_name, allow_missing=True)
 
+        @tvm.register_func(func_name, override=True)
+        def tvm_callback_cuda_postproc(code, _):
+            ret = []
+            for line in code.split("\n"):
+                ret.append(line)
+                ret.append("\n")
+                if line.startswith('extern "C" __global__') and line.endswith("{"):
+                    break
+            ret.append("}")
+            return "".join(ret)
 
-@tvm.testing.requires_cuda
-def test_cp_async_in_if_then_else():
-    global support_async
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        support_async = False
+    yield
+
+    # Restore previous postproc func to avoid impacting other tests
+    if prev_postproc is not None:
+        tvm.register_func(func_name, prev_postproc, override=True)
 
+
+@tvm.testing.requires_cuda
+def test_cp_async_in_if_then_else(postproc_if_missing_async_support):
     @T.prim_func
     def simple_compute(
         A: T.Buffer((16, 14), "float32"),
@@ -421,23 +426,13 @@ def simple_compute(
 
     mod = tvm.IRModule.from_expr(simple_compute)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        tvm.build(mod, target="cuda")
+        built = tvm.build(mod, target="cuda")
+    generated_code = built.imported_modules[0].get_source()
     assert generated_code == expected_cuda_script
 
-    if not support_async:
-        # avoid return dummy code to other tests
-        support_async = True
-
 
 @tvm.testing.requires_cuda
-def test_vectorize_cp_async_in_if_then_else():
-    global support_async
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    if major < 8:
-        # At least sm80 is required
-        support_async = False
-
+def test_vectorize_cp_async_in_if_then_else(postproc_if_missing_async_support):
     @T.prim_func
     def complex_compute(
         A: T.Buffer((2, 16, 16, 1280), "float16"),
@@ -886,17 +881,11 @@ def complex_compute(
 
     mod = tvm.IRModule.from_expr(complex_compute)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        tvm.build(mod, target="cuda")
+        built = tvm.build(mod, target="cuda")
+    generated_code = built.imported_modules[0].get_source()
     # generated_code must contain "  setp.ne.b32 p, %0, 0;"
     assert "setp.ne.b32" in generated_code
 
-    if not support_async:
-        # avoid return dummy code to other tests
-        support_async = True
-
 
 if __name__ == "__main__":
-    test_inject_async_copy()
-    test_inject_async_copy_shared_dyn()
-    test_cp_async_in_if_then_else()
-    test_vectorize_cp_async_in_if_then_else()
+    tvm.testing.main()

From f302f3ad7cbe1e6c9554415a9f0eeaa929804603 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Tue, 4 Jul 2023 09:41:57 -0500
Subject: [PATCH 2/2] Update to compare against generated code even with
 post-processing

---
 ...est_tir_transform_inject_ptx_async_copy.py | 42 +++++++++++++------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
index e15ef6402d6b..3543f798c36e 100644
--- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
@@ -361,13 +361,27 @@ def postproc_if_missing_async_support():
     support_async = major >= 8
 
     func_name = "tvm_callback_cuda_postproc"
-    prev_postproc = None
-
-    if not support_async:
-        prev_postproc = tvm.get_global_func(func_name, allow_missing=True)
-
-        @tvm.register_func(func_name, override=True)
-        def tvm_callback_cuda_postproc(code, _):
+    prev_postproc = tvm.get_global_func(func_name, allow_missing=True)
+
+    # Store the generated code prior to the post-processing.  This
+    # way, even though the generated code doesn't compile on platforms
+    # that do not support async, the comparison against an expected
+    # output can still be performed.  We cannot use
+    # `mod.get_source()`, as that contains the source after all
+    # post-processing.
+    original_code = None
+
+    def get_original_code():
+        nonlocal original_code
+        return original_code
+
+    @tvm.register_func(func_name, override=True)
+    def tvm_callback_cuda_postproc(code, _):
+        nonlocal original_code
+        original_code = code
+        if support_async:
+            return code
+        else:
             ret = []
             for line in code.split("\n"):
                 ret.append(line)
@@ -377,10 +391,12 @@ def tvm_callback_cuda_postproc(code, _):
             ret.append("}")
             return "".join(ret)
 
-    yield
+    yield get_original_code
 
     # Restore previous postproc func to avoid impacting other tests
-    if prev_postproc is not None:
+    if prev_postproc is None:
+        tvm._ffi.registry.remove_global_func(func_name)
+    else:
         tvm.register_func(func_name, prev_postproc, override=True)
 
 
@@ -426,8 +442,8 @@ def simple_compute(
 
     mod = tvm.IRModule.from_expr(simple_compute)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        built = tvm.build(mod, target="cuda")
-    generated_code = built.imported_modules[0].get_source()
+        tvm.build(mod, target="cuda")
+    generated_code = postproc_if_missing_async_support()
     assert generated_code == expected_cuda_script
 
 
@@ -881,8 +897,8 @@ def complex_compute(
 
     mod = tvm.IRModule.from_expr(complex_compute)
     with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
-        built = tvm.build(mod, target="cuda")
-    generated_code = built.imported_modules[0].get_source()
+        tvm.build(mod, target="cuda")
+    generated_code = postproc_if_missing_async_support()
     # generated_code must contain "  setp.ne.b32 p, %0, 0;"
     assert "setp.ne.b32" in generated_code