Skip to content

Commit

Permalink
Copy *tune_gemm* from triton-mlir branch to main_perf branch (#614)
Browse files Browse the repository at this point in the history
* Copy *tune_gemm* from `triton-mlir` branch to `main_perf` branch

The source commit in `triton-mlir` branch is the following one:
```
commit cf44637
Author: Lixun Zhang <[email protected]>
Date:   Tue Jul 23 14:22:01 2024 -0500

    [tuning] gemm tuning script v3.3 (#606)
```

*tune_gemm* was copied from the source branch directory `scripts/amd/gemm`
to the destination branch directory `python/perf-kernels/tune_gemm`.

The SHA-256 hashes of *tune_gemm* files are the following ones:
```
423aef1deb6c60f6578a1ecfc94d2473f8746b00d0368c553d31641fcfa5e354  README.md
46ab93978fee33f75df23332f12546dae7910478c391f08b7b1ebd415d8266b7  icache_flush.py
f18711544641b810a652e6a6629bfa2b613f6ade87399e88fdf05b81d4af58a4  matmul.py
84a1c80ede36d3154e51188276eda2d2d0f52ed4f496ff69349c390d83b8ec10  matmul_kernel.py
2812b40183637bc8d7e47d283c7d66b1792134a43de76f3eacf7b9b3e1c2431a  one_config.py
0ac09c33b0173cea06ddabbf9f4e3afa1816781dea4fdcce5894a7e7d6a80e19  rocprof_gemm.py
00eff41cf1c0bfc41d623e42b51706af67639fec76146741e2067d2a93e0148a  utils/file_generator.py
cb7afb773ccee835b00396cccf87e0d44fe513131161f031fae42453725b3c82  utils/utils.py
59f23811b660e49e566927853926a21f02a7014bb19c8ea67e6b382db6c59900  tune_gemm.py
e787f35d750b869f113b3c01692f64243a9cb8a71a18ade2f0465f614f7284e4  tune_gemm.sh
```

The files were kept as-is despite `pre-commit` intentions to change them.

After that, references to the *tune_gemm* directory in code and documentation were fixed to reflect
its new location.
  • Loading branch information
brunomazzottiamd authored Aug 13, 2024
1 parent 1d2e066 commit 11e4447
Show file tree
Hide file tree
Showing 10 changed files with 2,691 additions and 0 deletions.
316 changes: 316 additions & 0 deletions python/perf-kernels/tune_gemm/README.md

Large diffs are not rendered by default.

94 changes: 94 additions & 0 deletions python/perf-kernels/tune_gemm/icache_flush.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import ctypes
import array
import random
import math

# the hip module can be installed as
# `python3 -m pip install -i https://test.pypi.org/simple hip-python~=$rocm_version`
# more information about hip-python is at: https://github.com/ROCm/hip-python
from hip import hip, hiprtc

def hip_check(call_result):
    """Unwrap the result of a HIP / HIPRTC call, raising on a failure status.

    ``call_result`` is indexed as a sequence: element 0 is treated as the
    status code and everything after it as the payload.

    Returns:
        The payload. When the payload has exactly one element, that element
        is returned directly; otherwise the slice ``call_result[1:]`` is
        returned unchanged (which may be empty).

    Raises:
        RuntimeError: if the status is a ``hip.hipError_t`` other than
            ``hipSuccess``, or a ``hiprtc.hiprtcResult`` other than
            ``HIPRTC_SUCCESS``. The message is ``str(status)``.

    A status of any other type is not checked and never raises.
    """
    status = call_result[0]
    payload = call_result[1:]

    # Runtime-API failure.
    if isinstance(status, hip.hipError_t) and status != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(status))

    # Runtime-compiler (HIPRTC) failure.
    if isinstance(status, hiprtc.hiprtcResult) and status != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
        raise RuntimeError(str(status))

    return payload[0] if len(payload) == 1 else payload

# S_ICACHE_INV Invalidate entire first level instruction cache.
# There must be 16 separate S_NOP instructions or a jump/branch instruction
# after this instruction to ensure the internal instruction buffers are also
# invalidated.
def gen_kernel():
    """Compile the instruction-cache flush kernel with HIPRTC and return it.

    The kernel issues ``s_icache_inv`` followed by 16 ``s_nop 0``
    instructions. It is compiled for the ``gcnArchName`` reported by
    device 0, loaded as a module, and looked up by name.

    Returns:
        The function handle for ``icache_flush_kernel`` as returned by
        ``hip.hipModuleGetFunction``. The module it belongs to is
        intentionally left loaded so that the handle stays usable.

    Raises:
        RuntimeError: if any HIP / HIPRTC call reports a failure status, or
            if compilation fails (the message is then the compiler log,
            which is also printed).
    """
    source = b"""\
extern "C" __global__ void icache_flush_kernel() {
asm __volatile__("s_icache_inv");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
asm __volatile__("s_nop 0");
}
"""

    prog = hip_check(hiprtc.hiprtcCreateProgram(source, b"icache_flush_kernel", 0, [], []))
    try:
        # Target the architecture of device 0.
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props, 0))
        cflags = [b"--offload-arch=" + props.gcnArchName]

        # Not routed through hip_check: on failure we want the compiler log
        # in the exception rather than just the status code.
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            print(f"log = {log.decode()}, err = {err}")
            raise RuntimeError(log.decode())

        # Copy the compiled code object into a host-side buffer.
        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))
    finally:
        # Release the HIPRTC program on both the success and failure paths;
        # the compiled code (if any) has already been copied into `code`.
        # `prog.createRef()` follows the hip-python HIPRTC examples.
        hip_check(hiprtc.hiprtcDestroyProgram(prog.createRef()))

    module = hip_check(hip.hipModuleLoadData(code))
    return hip_check(hip.hipModuleGetFunction(module, b"icache_flush_kernel"))

# Build the flush kernel once, at import time; icache_flush() launches this handle.
kernel = gen_kernel()
# Query the properties of device 0; only the multiprocessor count is read below.
progs = hip.hipDeviceProp_t()
hip_check(hip.hipGetDeviceProperties(progs, 0))
# Multiprocessor count of device 0; icache_flush() derives its grid size from it.
cu_num = progs.multiProcessorCount

def icache_flush():
    """Launch the instruction-cache flush kernel.

    Uses the module-level ``kernel`` handle with 64 threads per block and
    ``cu_num * 60`` blocks, no shared memory, no kernel arguments, and
    ``stream=None``. No synchronization call is made here.

    Raises:
        RuntimeError: if ``hip_check`` sees a failure status from the launch.
    """
    threads_per_block = hip.dim3(x=64)
    num_blocks = hip.dim3(cu_num * 60)

    launch_status = hip.hipModuleLaunchKernel(
        kernel,
        *num_blocks,
        *threads_per_block,
        sharedMemBytes=0,
        stream=None,
        kernelParams=None,
        extra=(),
    )
    hip_check(launch_status)
Loading

0 comments on commit 11e4447

Please sign in to comment.