Skip to content

Commit

Permalink
[LLVMGPU] Fix linear dim selection in GPUApplyTilingLevel (#17611)
Browse files Browse the repository at this point in the history
`GPUApplyTilingLevel` skips over dimensions with no tiling (tile size of
zero), and the linear dim mapping in the forall loop also skips the
indices of the dimensions. This PR fixes this so the linear dim mappings
always increment by 1 starting from 0.

Signed-off-by: Max Dawkins <[email protected]>
  • Loading branch information
Max191 authored Jun 7, 2024
1 parent 2baf6c3 commit f062b19
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,11 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,
// TODO: Add some helpers to construct this based on the enum type rather
// than doing it here.
SmallVector<DeviceMappingAttrInterface> mapping;
for (auto [idx, size] : llvm::enumerate(tileSizes)) {
int idx = 0;
for (auto size : tileSizes) {
if (!isConstantIntValue(size, 0)) {
unsigned mappingId =
static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx;
static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx++;
mapping.push_back(gpu::GPUThreadMappingAttr::get(
context, static_cast<gpu::MappingId>(mappingId)));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,42 @@ module {

// -----

#config = #iree_gpu.lowering_config<{thread = [0, 16]}>
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @sequential_forall_mappings() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>> -> tensor<4x256xf32>
%6 = linalg.generic {
indexing_maps = [#map, #map, #map],
iterator_types = ["parallel", "parallel"]
} ins(%3, %4 : tensor<4x256xf32>, tensor<4x256xf32>) outs(%5 : tensor<4x256xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<4x256xf32>
flow.dispatch.tensor.store %6, %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : tensor<4x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
return
}
}

// Verify that no loops are generated without a reduction configuration.
// CHECK-LABEL: func.func @sequential_forall_mappings
// CHECK-NOT: scf.for

// THREAD-LABEL: func.func @sequential_forall_mappings
// THREAD: scf.forall ({{.*}}) = (0) to (256) step (16)
// THREAD: linalg.generic {{.*}} ins(%{{.*}}: tensor<4x16xf32>, tensor<4x16xf32>)
// THREAD: scf.forall.in_parallel
// THREAD: mapping = [#gpu.thread<linear_dim_0>]

// -----

module {
func.func @matmul_transpose_b() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [128, 2, 1] subgroup_size = 64>} {
%c4 = arith.constant 4 : index
Expand Down

0 comments on commit f062b19

Please sign in to comment.