[Codegen] Update translation_info attribute assembly format. NFC. #19107

Merged 1 commit on Nov 12, 2024
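In summary: the `#iree_codegen.translation_info` attribute now spells its lowering pipeline as an explicit `pipeline =` named parameter instead of a leading positional value, and the hunks shown here mechanically update Codegen lit tests to the new assembly format. A minimal before/after sketch, taken verbatim from the hunks below:

```mlir
// Old assembly format: the pipeline is a positional parameter.
#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>

// New assembly format: the pipeline is a named parameter, consistent with
// the named `workgroup_size` and `subgroup_size` parameters.
#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>
```

NFC: no functional change, only the textual attribute syntax differs.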
@@ -210,7 +210,7 @@ func.func @matmul_cleanup(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tens
module {
func.func @inferred_add_tensor(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: tensor<64x256xf32>) -> tensor<64x256xf32>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
} {
%6 = linalg.generic {
indexing_maps = [#map, #map, #map],
@@ -241,7 +241,7 @@ module {
module {
func.func @inferred_dynamic(%3: tensor<?x?xf32>, %4: tensor<?x?xf32>, %5: tensor<?x?xf32>) -> tensor<?x?xf32>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
} {
%6 = linalg.generic {
indexing_maps = [#map, #map, #map],
@@ -271,7 +271,7 @@ module {
module {
func.func @inferred_small_inner_dim(%3: tensor<8x2xf32>, %4: tensor<8x2xf32>, %5: tensor<8x2xf32>) -> tensor<8x2xf32>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
} {
%6 = linalg.generic {
indexing_maps = [#map, #map, #map],
@@ -298,7 +298,7 @@ module {
module {
func.func @inferred_small_inner_dim_fill_vector_sizes(%0: tensor<4x16x8x4x16x2x4xf16>, %1: tensor<4x16x8x4x16x2x4xf16>) -> tensor<4x16x8x4x16x2x4xf16>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {}>
} {
%2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
ins(%0 : tensor<4x16x8x4x16x2x4xf16>)
@@ -321,7 +321,7 @@ module {
func.func @inferred_small_inner_dim_dont_fill_non_contiguous(
%0: tensor<4x16x4x4xf16>, %1: tensor<4x16x4x4xf16>) -> tensor<4x16x4x4xf16>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
ins(%0 : tensor<4x16x4x4xf16>)
@@ -343,7 +343,7 @@ module {
module {
func.func @inferred_unaligned(%0: tensor<70xf16>, %1: tensor<70xf16>) -> tensor<70xf16>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
ins(%0 : tensor<70xf16>)
@@ -365,7 +365,7 @@ module {
module {
func.func @inferred_smaller_load(%0: tensor<128xf16>, %1: tensor<128xf16>) -> tensor<128xf16>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
ins(%0 : tensor<128xf16>)
@@ -386,7 +386,7 @@ module {
module {
func.func @inferred_im2col(%2: tensor<2x34x34x128xf16>, %3: tensor<2x128x8xf16>) -> tensor<2x128x8xf16>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [16, 32, 1] subgroup_size = 64, {}>
} {
%4 = iree_linalg_ext.im2col {lowering_config = #config}
strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
@@ -529,7 +529,7 @@ module {
func.func @distribute_multi_result_generic(
%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x4xf32>, %arg2: tensor<3x4xf32>) -> (tensor<3x4x5xf32>, tensor<3x4x5xf32>)
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32, {}>
} {
%empty = tensor.empty() : tensor<3x4x5xf32>
%0:2 = linalg.generic {
@@ -8,7 +8,7 @@
#map = affine_map<()[s0] -> (s0 * 256)>
#map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
#map2 = affine_map<(d0) -> (d0 * 4)>
-#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [64, 1, 1]>
func.func @add_tensor() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%c64 = arith.constant 64 : index
@@ -57,7 +57,7 @@ func.func @add_tensor() attributes {translation_info = #translation} {
#map = affine_map<()[s0] -> (s0 * 256)>
#map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
#map2 = affine_map<(d0) -> (d0 * 4)>
-#translation = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1]>
func.func @add_tensor_lane_id() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%c64 = arith.constant 64 : index
@@ -1,7 +1,7 @@
// RUN: iree-opt %s --split-input-file --mlir-print-local-scope \
// RUN: --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute-forall, canonicalize, cse))" | FileCheck %s

-#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>
+#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>

func.func @distribute_thread_forall(%out : memref<?xi32>)
attributes {translation_info = #translation_info} {
@@ -24,7 +24,7 @@ func.func @distribute_thread_forall(%out : memref<?xi32>)

// -----

-#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>
+#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>

func.func @distribute_warp_forall(%out : memref<?xi32>)
attributes {translation_info = #translation_info} {
@@ -47,7 +47,7 @@ func.func @distribute_warp_forall(%out : memref<?xi32>)

// -----

-#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>
+#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>

func.func @distribute_lane_forall(%out : memref<?xi32>)
attributes {translation_info = #translation_info} {
@@ -64,7 +64,7 @@ func.func @distribute_lane_forall(%out : memref<?xi32>)

// -----

-#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>
+#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>

func.func @distribute_thread_forall_drop_for_loop(%out : memref<?xi32>)
attributes {translation_info = #translation_info} {
@@ -87,7 +87,7 @@ func.func @distribute_thread_forall_drop_for_loop(%out : memref<?xi32>)

// -----

-#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>
+#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>

func.func @distribute_thread_forall_single_thread(%out : memref<?xi32>)
attributes {translation_info = #translation_info} {
@@ -110,7 +110,7 @@ func.func @distribute_thread_forall_single_thread(%out : memref<?xi32>)

// -----

-#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>
+#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 2, 1] subgroup_size = 32>

func.func @distribute_thread_forall_multi_dim(%out : memref<?x?x?xi32>)
attributes {translation_info = #translation_info} {
@@ -135,7 +135,7 @@ func.func @distribute_thread_forall_multi_dim(%out : memref<?x?x?xi32>)

// -----

-#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [7, 1, 1] subgroup_size = 32>
+#translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [7, 1, 1] subgroup_size = 32>

func.func @distribute_thread_forall_small_workgroup(%out : memref<?xi32>)
attributes {translation_info = #translation_info} {
@@ -1,7 +1,7 @@
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute-scf-for))" --mlir-print-local-scope %s | FileCheck %s
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute-scf-for{use-block-dims=false}))" --mlir-print-local-scope %s | FileCheck --check-prefix=NO-BLOCK-DIM %s

-#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [64, 1, 1]>
func.func @distribute_to_x(%lb : index, %ub : index, %step: index, %output: memref<?xf32>)
attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index
@@ -37,7 +37,7 @@ func.func @distribute_to_x(%lb : index, %ub : index, %step: index, %output: memr

// -----

-#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [1, 64, 1]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [1, 64, 1]>
func.func @distribute_to_y(%lb : index, %ub : index, %step: index, %output: memref<?xf32>)
attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index
@@ -64,7 +64,7 @@ func.func @distribute_to_y(%lb : index, %ub : index, %step: index, %output: memr

// -----

-#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [1, 1, 64]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [1, 1, 64]>
func.func @distribute_to_z(%lb : index, %ub : index, %step: index, %output: memref<?xf32>)
attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index
@@ -1,7 +1,7 @@
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-codegen-gpu-distribute-shared-memory-copy, fold-memref-alias-ops, canonicalize, cse))' %s | FileCheck %s

#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 4, 1]>
+#translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [32, 4, 1]>
#map0 = affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 128 + s2 * 512)>
module {
memref.global "private" @__shared_memory___1 : memref<3x512xf32, 3>
@@ -90,7 +90,7 @@ module {
// -----

#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 8, 1]>
+#translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [32, 8, 1]>
module {

func.func @unaligned_shared_memory_copy(
@@ -136,7 +136,7 @@ module {
// -----

#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 8, 1]>
+#translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [32, 8, 1]>
module {
func.func @zero_dim_shared_memory_copy(%global : memref<f32>, %shared : memref<f32>)
attributes {hal.executable.target = #executable_target, translation_info = #translation_info} {
@@ -162,7 +162,7 @@ module {
// -----

#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 8, 1]>
+#translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [32, 8, 1]>
module {
func.func @zero_dim_shared_memory_copy(%A: memref<1x32x128xi4>, %B: memref<1x128xf32>, %C: memref<1x128xi4>,
%SM: memref<1x32x128xf32, #gpu.address_space<workgroup>>)
@@ -5,7 +5,7 @@
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @simple_generic(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: tensor<64x256xf32>) -> tensor<64x256xf32>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%6 = linalg.generic {
indexing_maps = [#map, #map, #map],
@@ -29,7 +29,7 @@ func.func @simple_generic(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: te
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @fuse_destination(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>) -> tensor<64x64xf32>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%empty = tensor.empty() : tensor<64x64xf32>
%cst = arith.constant 0.0 : f32
@@ -50,7 +50,7 @@ func.func @fuse_destination(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>) -> ten

func.func @in_nested_region(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tensor<64x64xf32>) -> tensor<64x64xf32>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
@@ -77,7 +77,7 @@ func.func @in_nested_region(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: te

func.func @do_not_redistribute_in_forall(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tensor<64x64xf32>) -> tensor<64x64xf32>
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%c8 = arith.constant 8 : index
%c64 = arith.constant 64 : index
@@ -134,7 +134,7 @@ func.func @do_not_redistribute_in_forall(%3: tensor<64x64xf32>, %4: tensor<64x64

func.func @multiple_use_tilable_op(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>) -> (tensor<64x256xf32>, tensor<256x64xf32>)
attributes {
-translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
+translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {}>
} {
%add_empty = tensor.empty() : tensor<64x256xf32>
%6 = linalg.add
@@ -25,7 +25,7 @@
]>
hal.executable private @main_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>, workgroup_size = [64 : index, 16 : index, 1 : index]} {
+hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>, workgroup_size = [64 : index, 16 : index, 1 : index]} {
^bb0(%arg0: !hal.device):
%c250 = arith.constant 250 : index
%c500 = arith.constant 500 : index
@@ -230,7 +230,7 @@ func.func @weight_dequant_matmul() {
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
-func.func @conv() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
+func.func @conv() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>>
@@ -8,7 +8,7 @@
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 256]]>
#map = affine_map<()[s0] -> (s0 * 256)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
-#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [64, 1, 1]>
module {
func.func @add_tensor() attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index
@@ -64,7 +64,7 @@ module {
#map = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0)>
-#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [64, 1, 1]>
module {
func.func @reduction() attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index
@@ -120,7 +120,7 @@ module {
#map = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [64, 1, 1]>
module {
func.func @reduction_broadcast() attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index