From 22d8a8dca33ab53acad8f7dce6554bd2e01dfd7a Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 21 Jun 2023 09:19:59 -0500 Subject: [PATCH 1/2] [UnitTest][NVPTX] Avoid cascading failures from CUDA postproc Prior to this commit, the tests in `test_tir_transform_inject_ptx_async_copy.py` registered the `"tvm_callback_cuda_postproc"` function during pytest collection, and used a global variable to disable its functionality outside of the tests in this file. This had two major issues. First, if any other test also installs a postproc function, these postproc function required by the NVPTX tests would be overwritten. Second, if one of the NTPTX tests fails, the global variable controlling the postproc function would not be reset, causing any subsequent CUDA-related tests to also fail. This commit updates these NVPTX tests to conditionally install the postproc function, to de-register it after the test instead of disabling its functionality, and to de-register it regardless of the test result. This issue was initially found when debugging https://github.com/apache/tvm/pull/15103, when a failure in `test_tir_transform_inject_ptx_async_copy.py::test_cp_async_in_if_then_else` caused failures in 32 unrelated tests ([CI link](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-gpu/detail/PR-15103/7/tests)). --- ...est_tir_transform_inject_ptx_async_copy.py | 83 ++++++++----------- 1 file changed, 36 insertions(+), 47 deletions(-) diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py index 1e1ef410b4e1..e15ef6402d6b 100644 --- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py +++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py @@ -14,11 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import numpy as np + import tvm import tvm.testing from tvm.script import tir as T +import pytest +import numpy as np + def count_cp_async(stmt): num_alloc = [0] @@ -351,36 +354,38 @@ def test_inject_async_copy_shared_dyn(): """ -generated_code = "" -support_async = True +@pytest.fixture +def postproc_if_missing_async_support(): + arch = tvm.contrib.nvcc.get_target_compute_version() + major, _ = tvm.contrib.nvcc.parse_compute_version(arch) + support_async = major >= 8 + func_name = "tvm_callback_cuda_postproc" + prev_postproc = None -@tvm.register_func -def tvm_callback_cuda_postproc(code, _): - global generated_code - global support_async - generated_code = code - # return a dummy code so that device < sm80 could build correctly if not support_async: - ret = "" - for line in code.split("\n"): - ret += line + "\n" - if line.startswith('extern "C" __global__'): - break - ret += "}" - return ret - return code + prev_postproc = tvm.get_global_func(func_name, allow_missing=True) + @tvm.register_func(func_name, override=True) + def tvm_callback_cuda_postproc(code, _): + ret = [] + for line in code.split("\n"): + ret.append(line) + ret.append("\n") + if line.startswith('extern "C" __global__') and line.endswith("{"): + break + ret.append("}") + return "".join(ret) -@tvm.testing.requires_cuda -def test_cp_async_in_if_then_else(): - global support_async - arch = tvm.contrib.nvcc.get_target_compute_version() - major, _ = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # At least sm80 is required - support_async = False + yield + + # Restore previous postproc func to avoid impacting other tests + if prev_postproc is not None: + tvm.register_func(func_name, prev_postproc, override=True) + +@tvm.testing.requires_cuda +def test_cp_async_in_if_then_else(postproc_if_missing_async_support): @T.prim_func def simple_compute( A: T.Buffer((16, 14), "float32"), @@ -421,23 +426,13 @@ def simple_compute( mod = tvm.IRModule.from_expr(simple_compute) with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): - tvm.build(mod, target="cuda") + built = tvm.build(mod, target="cuda") + generated_code = built.imported_modules[0].get_source() assert generated_code == expected_cuda_script - if not support_async: - # avoid return dummy code to other tests - support_async = True - @tvm.testing.requires_cuda -def test_vectorize_cp_async_in_if_then_else(): - global support_async - arch = tvm.contrib.nvcc.get_target_compute_version() - major, _ = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # At least sm80 is required - support_async = False - +def test_vectorize_cp_async_in_if_then_else(postproc_if_missing_async_support): @T.prim_func def complex_compute( A: T.Buffer((2, 16, 16, 1280), "float16"), @@ -886,17 +881,11 @@ def complex_compute( mod = tvm.IRModule.from_expr(complex_compute) with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): - tvm.build(mod, target="cuda") + built = tvm.build(mod, target="cuda") + generated_code = built.imported_modules[0].get_source() # generated_code must contain " setp.ne.b32 p, %0, 0;" assert "setp.ne.b32" in generated_code - if not support_async: - # avoid return dummy code to other tests - support_async = True - if __name__ == "__main__": - test_inject_async_copy() - test_inject_async_copy_shared_dyn() - test_cp_async_in_if_then_else() - test_vectorize_cp_async_in_if_then_else() + tvm.testing.main() From f302f3ad7cbe1e6c9554415a9f0eeaa929804603 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 4 Jul 2023 09:41:57 -0500 Subject: [PATCH 2/2] Update to compare against generated code even with post-processing --- ...est_tir_transform_inject_ptx_async_copy.py | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py index e15ef6402d6b..3543f798c36e 100644 --- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py +++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py @@ -361,13 +361,27 @@ def postproc_if_missing_async_support(): support_async = major >= 8 func_name = "tvm_callback_cuda_postproc" - prev_postproc = None - - if not support_async: - prev_postproc = tvm.get_global_func(func_name, allow_missing=True) - - @tvm.register_func(func_name, override=True) - def tvm_callback_cuda_postproc(code, _): + prev_postproc = tvm.get_global_func(func_name, allow_missing=True) + + # Store the generated code prior to the post-processing. This + # way, even though the generated code doesn't compile on platforms + # that do not support async, the comparison against an expected + # output can still be performed. We cannot use + # `mod.get_source()`, as that contains the source after all + # post-processing. + original_code = None + + def get_original_code(): + nonlocal original_code + return original_code + + @tvm.register_func(func_name, override=True) + def tvm_callback_cuda_postproc(code, _): + nonlocal original_code + original_code = code + if support_async: + return code + else: ret = [] for line in code.split("\n"): ret.append(line) @@ -377,10 +391,12 @@ def tvm_callback_cuda_postproc(code, _): ret.append("}") return "".join(ret) - yield + yield get_original_code # Restore previous postproc func to avoid impacting other tests - if prev_postproc is not None: + if prev_postproc is None: + tvm._ffi.registry.remove_global_func(func_name) + else: tvm.register_func(func_name, prev_postproc, override=True) @@ -426,8 +442,8 @@ def simple_compute( mod = tvm.IRModule.from_expr(simple_compute) with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): - built = tvm.build(mod, target="cuda") - generated_code = built.imported_modules[0].get_source() + tvm.build(mod, target="cuda") + generated_code = postproc_if_missing_async_support() assert generated_code == expected_cuda_script @@ -881,8 +897,8 @@ def complex_compute( mod = tvm.IRModule.from_expr(complex_compute) with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): - built = tvm.build(mod, target="cuda") - generated_code = built.imported_modules[0].get_source() + tvm.build(mod, target="cuda") + generated_code = postproc_if_missing_async_support() # generated_code must contain " setp.ne.b32 p, %0, 0;" assert "setp.ne.b32" in generated_code