diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
index 6862aff9ddaa..acd189046c98 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
@@ -107,10 +107,11 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,
   // TODO: Add some helpers to construct this based on the enum type rather
   // than doing it here.
   SmallVector<Attribute> mapping;
-  for (auto [idx, size] : llvm::enumerate(tileSizes)) {
+  int idx = 0;
+  for (auto size : tileSizes) {
     if (!isConstantIntValue(size, 0)) {
       unsigned mappingId =
-          static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx;
+          static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx++;
       mapping.push_back(gpu::GPUThreadMappingAttr::get(
           context, static_cast<gpu::MappingId>(mappingId)));
     }
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
index 8c76a6e69230..ab39fc2ffa89 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
@@ -37,6 +37,42 @@ module {

 // -----

+#config = #iree_gpu.lowering_config<{thread = [0, 16]}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @sequential_forall_mappings() {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x256xf32>>
+    %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %6 = linalg.generic {
+      indexing_maps = [#map, #map, #map],
+      iterator_types = ["parallel", "parallel"]
+    } ins(%3, %4 : tensor<4x256xf32>, tensor<4x256xf32>) outs(%5 : tensor<4x256xf32>) attrs = {lowering_config = #config} {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %7 = arith.addf %in, %in_0 : f32
+      linalg.yield %7 : f32
+    } -> tensor<4x256xf32>
+    flow.dispatch.tensor.store %6, %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : tensor<4x256xf32> -> !flow.dispatch.tensor<readwrite:tensor<4x256xf32>>
+    return
+  }
+}
+
+// Verify that no loops are generated without a reduction configuration.
+// CHECK-LABEL: func.func @sequential_forall_mappings
+// CHECK-NOT:     scf.for
+
+// THREAD-LABEL: func.func @sequential_forall_mappings
+// THREAD:         scf.forall ({{.*}}) = (0) to (256) step (16)
+// THREAD:           linalg.generic {{.*}} ins(%{{.*}}: tensor<4x16xf32>, tensor<4x16xf32>)
+// THREAD:         scf.forall.in_parallel
+// THREAD:         mapping = [#gpu.thread<linear_dim_0>]
+
+// -----
+
 module {
   func.func @matmul_transpose_b() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} {
     %c4 = arith.constant 4 : index
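
The behavioral point of the C++ change above is that thread mapping ids are now assigned densely starting at LinearDim0, counting only the dimensions whose tile size is non-zero, instead of reusing each dimension's position in the tile-size list. Below is a minimal standalone C++ sketch of that indexing, not IREE code: the helper name assignThreadMappings and the trimmed-down MappingId enum are assumptions made purely for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

enum class MappingId : unsigned { LinearDim0 = 0, LinearDim1, LinearDim2 };

// Hypothetical helper mirroring the patched loop: assign ids densely from
// LinearDim0, skipping dimensions with a zero (untiled) tile size.
std::vector<MappingId> assignThreadMappings(const std::vector<int64_t> &tileSizes) {
  std::vector<MappingId> mapping;
  int idx = 0;
  for (int64_t size : tileSizes) {
    if (size != 0) {
      // Same arithmetic as the patch: LinearDim0 + idx++.
      mapping.push_back(static_cast<MappingId>(
          static_cast<unsigned>(MappingId::LinearDim0) + idx++));
    }
  }
  return mapping;
}

int main() {
  // thread = [0, 16], as in the new test: only the second dimension is tiled,
  // so it receives linear_dim_0. With the old enumerate-based index it would
  // have received linear_dim_1 and left linear_dim_0 unused.
  for (MappingId id : assignThreadMappings({0, 16}))
    std::printf("linear_dim_%u\n", static_cast<unsigned>(id));
  return 0;
}

This is exactly what the THREAD check in the new test exercises: a single scf.forall over the tiled dimension whose mapping attribute is #gpu.thread<linear_dim_0>.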