Skip to content

Commit

Permalink
Fixes non-static check to handle both the attributes and input args
Browse files Browse the repository at this point in the history
  • Loading branch information
hmalgewatta committed Oct 15, 2024
1 parent ea0b756 commit 6be6f71
Show file tree
Hide file tree
Showing 3 changed files with 1 addition and 90 deletions.
6 changes: 0 additions & 6 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -401,15 +401,9 @@ jobs:
lit -v "${LIT_TEST_DIR}"
- name: Run python tests on HIP
run: |
<<<<<<< HEAD
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
=======
SHARED_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/_C"
if [ ! -d "${SHARED_LIB_DIR}" ]; then
echo "Could not find '${SHARED_LIB_DIR}'" ; exit -1
>>>>>>> 5897e003 (Moves pytest, adds pytest to CI, verifies for static input args to view_slice)
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
pytest --capture=tee-sys -rfs third_party/amd/python/test/test_view_slice.py
Expand Down
83 changes: 0 additions & 83 deletions python/test/unit/language/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5831,11 +5831,9 @@ def kernel(
kernel[(1, )](x_tri, y_tri, shape[0], BLOCK_SIZE=shape[0])
# compare
np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01)
<<<<<<< HEAD


# -----------------------
<<<<<<< HEAD
# test loop unrolling
# -----------------------

Expand Down Expand Up @@ -5908,84 +5906,3 @@ def sanitize_cumsum_kernel(Z, X, BLOCK: tl.constexpr):
Z = torch.zeros_like(X)
sanitize_cumsum_kernel[(1, )](Z, X, BLOCK=BLOCK)
torch.testing.assert_close(Z, X.cumsum(0).to(torch.int32))
=======
# test view slice
# -----------------------

# Candidate layouts substituted for `#view_layout` in the test_view_slice IR.
# Only the first four constructor arguments vary between entries; the trailing
# three ([1, 1], [1, 1], [0, 1]) are the same for every layout.
# NOTE(review): the first and fourth rows are identical - confirm the
# duplicate parametrization is intended.
view_layout = [
    BlockedLayout(arg0, arg1, arg2, arg3, [1, 1], [1, 1], [0, 1])
    for arg0, arg1, arg2, arg3 in (
        ([1, 8], [16, 4], [4, 1], [1, 0]),
        ([2, 2], [16, 4], [2, 2], [1, 0]),
        ([2, 2], [16, 4], [2, 2], [0, 1]),
        ([1, 8], [16, 4], [4, 1], [1, 0]),
        ([1, 8], [16, 4], [4, 1], [0, 1]),
    )
]
# Candidate layouts substituted for `#blocked` in the test_view_slice IR.
# Only the first four constructor arguments vary between entries; the trailing
# three ([1, 1], [1, 1], [0, 1]) are the same for every layout.
# NOTE(review): the first and fourth rows are identical - confirm the
# duplicate parametrization is intended.
blocked_layout = [
    BlockedLayout(arg0, arg1, arg2, arg3, [1, 1], [1, 1], [0, 1])
    for arg0, arg1, arg2, arg3 in (
        ([1, 8], [16, 4], [4, 1], [1, 0]),
        ([2, 2], [16, 4], [2, 2], [1, 0]),
        ([2, 2], [16, 4], [2, 2], [0, 1]),
        ([1, 8], [16, 4], [4, 1], [1, 0]),
        ([1, 8], [16, 4], [4, 1], [0, 1]),
    )
]


@pytest.mark.parametrize(
    "M, N, M_tile_size, N_tile_size, M_tile_offset, N_tile_offset",
    [
        [256, 256, 256, 32, 0, 32],
        [128, 128, 128, 64, 0, 64],
    ],
)
@pytest.mark.parametrize("dtype", [torch.float16])
@pytest.mark.parametrize("view_layout", view_layout)
@pytest.mark.parametrize("blocked_layout", blocked_layout)
def test_view_slice(dtype, M, N, M_tile_size, N_tile_size, M_tile_offset, N_tile_offset, blocked_layout, view_layout,
                    device):
    """Check `amdgpu.view_slice` against plain torch slicing.

    A hand-written TTGIR module is compiled from a temporary file. The kernel
    loads an ``M x N`` f16 tensor in ``#blocked``, converts it to
    ``#view_layout``, takes the ``M_tile_size x N_tile_size`` slice at
    ``[M_tile_offset, N_tile_offset]``, converts back to ``#blocked`` and
    stores the tile. The stored tile must equal the same window of the input.

    The test is skipped when ``is_hip()`` is false. ``dtype`` is accepted from
    the parametrization, but the body hard-codes ``torch.float16`` and ``f16``.
    """
    if not is_hip():
        pytest.skip('view_slice is AMD specific instruction.')

    # The IR is assembled from three pieces because the module-attribute line
    # contains literal braces/quotes and is therefore kept as a plain string.
    threads_per_warp = 64
    layout_aliases = f"""
#blocked = {blocked_layout}
#view_layout = {view_layout}
"""
    module_header = """
module attributes {"triton_gpu.num-ctas" = 1, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = """
    module_body = f""" : i32}} {{
tt.func public @kernel_0d1d(%arg0: !tt.ptr<f16> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<f16> {{tt.divisibility = 16 : i32}}) {{
%cst = arith.constant dense<{N}> : tensor<{M}x1xi32, #blocked>
%cst_n = arith.constant dense<{N_tile_size}> : tensor<{M_tile_size}x1xi32, #blocked>
%0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>>
%42 = tt.make_range {{end = {M_tile_size} : i32, start = 0 : i32}} : tensor<{M_tile_size}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>>
%1 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>>
%2 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<{M}x{N}x!tt.ptr<f16>, #blocked>
%4 = tt.expand_dims %0 {{axis = 1 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>> -> tensor<{M}x1xi32, #blocked>
%43 = tt.expand_dims %42 {{axis = 1 : i32}} : tensor<{M_tile_size}xi32, #triton_gpu.slice<{{dim = 1, parent = #blocked}}>> -> tensor<{M_tile_size}x1xi32, #blocked>
%5 = arith.muli %4, %cst : tensor<{M}x1xi32, #blocked>
%44 = arith.muli %43, %cst_n : tensor<{M_tile_size}x1xi32, #blocked>
%6 = tt.expand_dims %1 {{axis = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>> -> tensor<1x{M}xi32, #blocked>
%7 = tt.broadcast %6 : tensor<1x{M}xi32, #blocked> -> tensor<{M}x{N}xi32, #blocked>
%8 = tt.broadcast %5 : tensor<{M}x1xi32, #blocked> -> tensor<{M}x{N}xi32, #blocked>
%9 = arith.addi %8, %7 : tensor<{M}x{N}xi32, #blocked>
%33 = tt.make_range {{end = {N_tile_size} : i32, start = 0 : i32}} : tensor<{N_tile_size}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>>
%34 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<{M_tile_size}x{N_tile_size}x!tt.ptr<f16>, #blocked>
%37 = tt.expand_dims %33 {{axis = 0 : i32}} : tensor<{N_tile_size}xi32, #triton_gpu.slice<{{dim = 0, parent = #blocked}}>> -> tensor<1x{N_tile_size}xi32, #blocked>
%38 = tt.broadcast %37 : tensor<1x{N_tile_size}xi32, #blocked> -> tensor<{M_tile_size}x{N_tile_size}xi32, #blocked>
%39 = tt.broadcast %44 : tensor<{M_tile_size}x1xi32, #blocked> -> tensor<{M_tile_size}x{N_tile_size}xi32, #blocked>
%40 = arith.addi %38, %39 : tensor<{M_tile_size}x{N_tile_size}xi32, #blocked>
%10 = tt.addptr %2, %9 : tensor<{M}x{N}x!tt.ptr<f16>, #blocked>, tensor<{M}x{N}xi32, #blocked>
%11 = tt.load %10 {{cache = 1 : i32, evict = 1 : i32, isVolatile = false}} : tensor<{M}x{N}x!tt.ptr<f16>, #blocked>
%12 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #blocked> -> tensor<{M}x{N}xf16, #view_layout>
%13 = amdgpu.view_slice %12[{M_tile_offset}, {N_tile_offset}] [{M_tile_size}, {N_tile_size}] [1, 1] : tensor<{M}x{N}xf16, #view_layout> to tensor<{M_tile_size}x{N_tile_size}xf16, #view_layout>
%14 = triton_gpu.convert_layout %13 : tensor<{M_tile_size}x{N_tile_size}xf16, #view_layout> -> tensor<{M_tile_size}x{N_tile_size}xf16, #blocked>
%15 = tt.addptr %34, %40 : tensor<{M_tile_size}x{N_tile_size}x!tt.ptr<f16>, #blocked>, tensor<{M_tile_size}x{N_tile_size}xi32, #blocked>
tt.store %15, %14 : tensor<{M_tile_size}x{N_tile_size}x!tt.ptr<f16>, #blocked>
tt.return
}}
}}
"""
    ir = layout_aliases + module_header + str(threads_per_warp) + module_body

    source = torch.randn((M, N), device=device, dtype=torch.float16)

    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as ir_file:
        ir_file.write(ir)
        # Flush so the compiler, which reopens the file by name, sees the text.
        ir_file.flush()
        # Compile while the temporary file still exists on disk.
        kernel = triton.compile(ir_file.name)

    sliced = torch.empty((M_tile_size, N_tile_size), device=device, dtype=torch.float16)

    kernel[(1, 1, 1)](source.data_ptr(), sliced)

    # Reference result: the same window taken with ordinary tensor indexing.
    row_window = slice(M_tile_offset, M_tile_size + M_tile_offset)
    col_window = slice(N_tile_offset, N_tile_offset + N_tile_size)
    assert torch.eq(source[row_window, col_window], sliced).all()
2 changes: 1 addition & 1 deletion third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include "llvm/ADT/TypeSwitch.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/OperationSupport.h"
#include "llvm/ADT/TypeSwitch.h"

#include "triton/Conversion/TritonGPUToLLVM/Utility.h"

Expand Down

0 comments on commit 6be6f71

Please sign in to comment.