Copy *tune_gemm* from triton-mlir branch to main_perf branch (#614)

* Copy *tune_gemm* from `triton-mlir` branch to `main_perf` branch The source commit in `triton-mlir` branch is the following one: ``` commit cf44637 Author: Lixun Zhang <[email protected]> Date: Tue Jul 23 14:22:01 2024 -0500 [tuning] gemm tuning script v3.3 (#606) ``` *tune_gemm* was copied from the source branch directory `scripts/amd/gemm` to the destination branch directory `python/perf-kernels/tune_gemm`. The SHA-256 hashes of *tune_gemm* files are the following ones: ``` 423aef1deb6c60f6578a1ecfc94d2473f8746b00d0368c553d31641fcfa5e354 README.md 46ab93978fee33f75df23332f12546dae7910478c391f08b7b1ebd415d8266b7 icache_flush.py f18711544641b810a652e6a6629bfa2b613f6ade87399e88fdf05b81d4af58a4 matmul.py 84a1c80ede36d3154e51188276eda2d2d0f52ed4f496ff69349c390d83b8ec10 matmul_kernel.py 2812b40183637bc8d7e47d283c7d66b1792134a43de76f3eacf7b9b3e1c2431a one_config.py 0ac09c33b0173cea06ddabbf9f4e3afa1816781dea4fdcce5894a7e7d6a80e19 rocprof_gemm.py 00eff41cf1c0bfc41d623e42b51706af67639fec76146741e2067d2a93e0148a utils/file_generator.py cb7afb773ccee835b00396cccf87e0d44fe513131161f031fae42453725b3c82 utils/utils.py 59f23811b660e49e566927853926a21f02a7014bb19c8ea67e6b382db6c59900 tune_gemm.py e787f35d750b869f113b3c01692f64243a9cb8a71a18ade2f0465f614f7284e4 tune_gemm.sh ``` The files were kept as-is despite `pre-commit` intentions to change them. After that, *tune_gemm* directory in code and documentation was fixed to reflect its new location.
ROCm · Aug 13, 2024 · 11e4447 · 11e4447
1 parent 1d2e066
commit 11e4447
Show file tree

Hide file tree

Showing 10 changed files with 2,691 additions and 0 deletions.
diff --git a/python/perf-kernels/tune_gemm/README.md b/python/perf-kernels/tune_gemm/README.md
diff --git a/python/perf-kernels/tune_gemm/icache_flush.py b/python/perf-kernels/tune_gemm/icache_flush.py
@@ -0,0 +1,94 @@
+import ctypes
+import array
+import random
+import math
+
+# the hip module can be installed as
+# `python3 -m pip install -i https://test.pypi.org/simple hip-python~=$rocm_version`
+# more information about hip-python is at: https://github.com/ROCm/hip-python
+from hip import hip, hiprtc
+
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+        ):
+        raise RuntimeError(str(err))
+
+    return result
+
+# S_ICACHE_INV Invalidate entire first level instruction cache.
+# There must be 16 separate S_NOP instructions or a jump/branch instruction
+# after this instruction to ensure the internal instruction buffers are also
+# invalidated.
+def gen_kernel():
+    source = b"""\
+        extern "C" __global__ void icache_flush_kernel() {
+              asm __volatile__("s_icache_inv");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+              asm __volatile__("s_nop 0");
+        }
+    """
+
+    # print(f"source = {source}")
+    prog = hip_check(hiprtc.hiprtcCreateProgram(source, b"icache_flush_kernel", 0, [], []))
+    progs = hip.hipDeviceProp_t()
+    hip_check(hip.hipGetDeviceProperties(progs, 0))
+    arch = progs.gcnArchName
+    cflags = [b"--offload-arch="+arch]
+    err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
+    if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
+        log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
+        log = bytearray(log_size)
+        hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
+        print(f"log = {log.decode()}, err = {err}")
+        raise RuntimeError(log.decode())
+
+    code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
+    code = bytearray(code_size)
+    hip_check(hiprtc.hiprtcGetCode(prog, code))
+    module = hip_check(hip.hipModuleLoadData(code))
+    kernel = hip_check(hip.hipModuleGetFunction(module, b"icache_flush_kernel"))
+
+    return kernel
+
+# Module-level setup, executed once when this module is imported.
+# Build the flush kernel up front so icache_flush() can launch it directly.
+kernel = gen_kernel()
+# Query device 0 for its multiprocessor count; icache_flush() uses cu_num
+# to size its launch grid.
+progs = hip.hipDeviceProp_t()
+hip_check(hip.hipGetDeviceProperties(progs, 0))
+cu_num = progs.multiProcessorCount
+
+def icache_flush():
+    block = hip.dim3(x=64)
+    grid = hip.dim3(cu_num * 60)
+
+    hip_check(hip.hipModuleLaunchKernel(
+        kernel,
+        *grid,
+        *block,
+        sharedMemBytes=0,
+        stream=None,
+        kernelParams=None,
+        extra=()
+        )
+    )